From 294510b120530e2792f822b28d29b8b9ffa74764 Mon Sep 17 00:00:00 2001
From: Will Barber
Date: Tue, 10 Dec 2024 16:28:42 +0000
Subject: [PATCH 01/20] Enable building against the highway library

Highway is added as a fourth architecture and can be used by setting
`-DARMRAL_ARCH=HWY`. Specific architecture flags can be passed using
`-DARMRAL_OPT_FLAGS=-march...`. The CC GitHub fork of highway is included
as a submodule, which must be cloned when building for highway. The
highway-specific source list is configured in `armral_hwy.cmake.in`.
---
 .gitmodules | 4 +
 CMakeLists.txt | 1088 +++++++++++++++++++++++++-----------------
 armral_acle.cmake.in | 136 ++++++
 armral_hwy.cmake.in | 128 +++++
 highway | 1 +
 include/armral.h | 22 +
 utils/rng.cpp | 5 +
 7 files changed, 946 insertions(+), 438 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 armral_acle.cmake.in
 create mode 100644 armral_hwy.cmake.in
 create mode 160000 highway

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e999975
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "highway"]
+ path = highway
+ url = https://github.com/cambridgeconsultants/aeroway
+ branch = aeroway_upstream
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1124e60..15fff4f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,95 +29,9 @@ option(ARMRAL_ENABLE_COVERAGE
 option(BUILD_SIMULATION "Enable building channel simulation programs" ON)
 set(ARMRAL_ARCH
     NEON
-    CACHE STRING "The architecture to build for ('NEON' or 'SVE2')")
-set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2")
-
-set(ARMRAL_LIB_SOURCES
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c
-
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) + CACHE STRING + "The architecture to build for ('NEON', 'SVE', 'SVE2' or 'HWY')") +set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE" "SVE2" "HWY") # Per source file compiler flag overrides/additions if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) @@ -267,54 +181,6 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) set(JOB_POOL_CONSOLE JOB_POOL console) endif() -if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # If the optimization flags are already set, don't try and guess what they - # should be. - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.5-a+sve2+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.2-a+sve+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "NEON") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8-a+crypto" - CACHE INTERNAL "") - else() - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # We explicitly set the optimization flags, so just copy those. 
We still need - # to set the appropriate SVE version definition - set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -else() - set(ARMRAL_ARCH_COMPILE_OPTIONS "") - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -endif() - if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} @@ -344,14 +210,23 @@ else() set(ARMRAL_LINKER_FLAGS "") endif() -add_library(armral ${ARMRAL_LIB_SOURCES}) +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) +# The armral library is defined within the include files +# +if(ARMRAL_ARCH STREQUAL "HWY") + # The armral_utils library will have additional link libraries added within + # this include + include(armral_hwy.cmake.in) +else() + include(armral_acle.cmake.in) +endif() + target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) -add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} @@ -476,300 +351,635 @@ if(BUILD_TESTING) DEPENDS bench_${BENCH_NAME}) endfunction() - add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) - add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) - add_armral_test(arm_solve - test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) - add_armral_test( - matrix_vector_mult_batch_16 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_batch_32 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_mult_16 - test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) - add_armral_test(matrix_mult_32 - test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) - add_armral_test(matrix_mult_aah_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_test(matrix_mult_ahb_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_test( - matrix_vector_mult_single_16 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_single_32 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_pseudo_inv_direct - test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_test(vec_dot_16_2 - test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_test(vec_dot_16_2_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_test(vec_dot_16_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_test(vec_dot_32 
test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_test(vec_dot_32_2 - test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) - add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_test(mu_law_compression - test/DuRuInterface/MuLaw/Compression/main.cpp) - add_armral_test(mu_law_decompression - test/DuRuInterface/MuLaw/Decompression/main.cpp) - add_armral_test(block_float_compression - test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) - add_armral_test(block_float_decompression - test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) - add_armral_test(block_scaling_compression - test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) - add_armral_test(block_scaling_decompression - test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) - add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) - add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) - add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) - add_armral_test(arm_fir_filter_cs16_decimate_2 - test/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) - add_armral_test(arm_fir_filter_cf32_decimate_2 - test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - add_armral_test(crc test/UpperPHY/CRC/main.cpp) - add_armral_test(tail_biting_convolutional_decoding - test/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_test(tail_biting_convolutional_encoding - test/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) - add_armral_test(polar_crc_attachment - test/UpperPHY/Polar/CrcAttachment/main.cpp) - add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) - add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) - add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) - add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_test(polar_subchannel_deinterleave - test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - add_armral_test(polar_subchannel_interleave - test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) - add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) - - add_armral_bench( - matrix_inv_batch_general - 
bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - add_armral_bench(matrix_inv_batch_general_pa - bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian_pa - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - add_armral_bench(matrix_inv_single_general - bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) - add_armral_bench(matrix_inv_single_hermitian - bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) - add_armral_bench(arm_solve_1x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - add_armral_bench(arm_solve_1x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - add_armral_bench(arm_solve_2x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - add_armral_bench(arm_solve_2x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - add_armral_bench(arm_solve_4x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32 - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - add_armral_bench( - matrix_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) - add_armral_bench( - matrix_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) - add_armral_bench( - matrix_mult_f32_2x2_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_2x2 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - add_armral_bench( - matmul_f32_general - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) - add_armral_bench( - matrix_mult_aah_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_bench( - matrix_mult_ahb_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - add_armral_bench( - matrix_vector_mult_32 - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_bench(matrix_pseudo_inv_direct - bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_bench(vec_dot_16 - bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_bench(vec_dot_16_2 - bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_bench(vec_dot_16_2_32_bit - 
bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_bench(vec_dot_16_32_bit - bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_bench(vec_dot_32 - bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_bench(vec_dot_32_2 - bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_bench(vec_mul_16_2 - bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) - add_armral_bench(vec_mul_32_2 - bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_bench(mu_law_compression_14bit - bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) - add_armral_bench(mu_law_compression_8bit - bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) - add_armral_bench(mu_law_compression_9bit - bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) - add_armral_bench(mu_law_decompression_14bit - bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) - add_armral_bench(mu_law_decompression_8bit - bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) - add_armral_bench(mu_law_decompression_9bit - bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) - add_armral_bench( - block_float_compression_12bit - bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) - add_armral_bench( - block_float_compression_14bit - bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) - add_armral_bench(block_float_compression_8bit - bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) - add_armral_bench(block_float_compression_9bit - bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) - add_armral_bench( - block_float_decompression_12bit - bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) - add_armral_bench( - block_float_decompression_14bit - bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) - add_armral_bench( - block_float_decompression_8bit - bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) - add_armral_bench( - block_float_decompression_9bit - bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) - add_armral_bench( - block_scaling_compression_14bit - bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) - add_armral_bench( - block_scaling_compression_8bit - bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) - add_armral_bench( - block_scaling_compression_9bit - bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) - add_armral_bench( - block_scaling_decompression_14bit - bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) - add_armral_bench( - block_scaling_decompression_8bit - bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) - add_armral_bench( - block_scaling_decompression_9bit - bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) - add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) - add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) - add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) - add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) - add_armral_bench(arm_fir_filter_cs16_decimate_2 - bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) - add_armral_bench(arm_fir_filter_cf32_decimate_2 - bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - 
add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding - bench/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding - bench/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) - add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) - add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) - add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) - add_armral_bench(polar_rate_matching - bench/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_bench(polar_rate_recovery - bench/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_bench(polar_subchannel_deinterleave - bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - add_armral_bench(polar_subchannel_interleave - bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_bench(turbo_rate_matching - bench/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_bench(turbo_rate_recovery - bench/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) + # Temporary duplication while porting is in progress to maintain the order of + # bench_excel_summary output + if(ARMRAL_ARCH STREQUAL "HWY") + # cmake-format: off + # add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + # add_armral_test(matrix_inv_single + # test/BasicMathFun/MatrixInv/Single/main.cpp) + # add_armral_test(arm_solve + # test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + # add_armral_test( + # matrix_vector_mult_batch_16 + # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + # add_armral_test( + # matrix_vector_mult_batch_32 + # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + # add_armral_test(matrix_mult_16 + # test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + # add_armral_test(matrix_mult_32 + # test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + # add_armral_test( + # matrix_mult_aah_32 + # test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + # 
add_armral_test( + # matrix_mult_ahb_32 + # test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + # add_armral_test( + # matrix_vector_mult_single_16 + # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + # add_armral_test( + # matrix_vector_mult_single_32 + # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + # add_armral_test(matrix_pseudo_inv_direct + # test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + # add_armral_test(vec_dot_16 + # test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + # add_armral_test(vec_dot_16_2 + # test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + # add_armral_test(vec_dot_16_2_32_bit + # test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + # add_armral_test(vec_dot_16_32_bit + # test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + # add_armral_test(vec_dot_32 + # test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + # add_armral_test(vec_dot_32_2 + # test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + # add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + # add_armral_test(vec_mul_16_2 + # test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + # add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + # add_armral_test(vec_mul_32_2 + # test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + # add_armral_test(mu_law_compression + # test/DuRuInterface/MuLaw/Compression/main.cpp) + # add_armral_test(mu_law_decompression + # test/DuRuInterface/MuLaw/Decompression/main.cpp) + # add_armral_test(block_float_compression + # test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + # add_armral_test(block_float_decompression + # test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + # add_armral_test(block_scaling_compression + # test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + # add_armral_test(block_scaling_decompression + # test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + # add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + # add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + # add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + # add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + # add_armral_test(arm_fir_filter_cs16_decimate_2 + # test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + # add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + # add_armral_test(arm_fir_filter_cf32_decimate_2 + # test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + # add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + # add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + # add_armral_test(crc test/UpperPHY/CRC/main.cpp) + # add_armral_test(tail_biting_convolutional_decoding + # test/UpperPHY/ConvolutionalDecoder/main.cpp) + # add_armral_test(tail_biting_convolutional_encoding + # test/UpperPHY/ConvolutionalEncoder/main.cpp) + # add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + # add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + # add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + # add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + # add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + # add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + # add_armral_test(polar_crc_attachment + # test/UpperPHY/Polar/CrcAttachment/main.cpp) + # add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + # add_armral_test(polar_encoder 
test/UpperPHY/Polar/Encoding/main.cpp) + # add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) + # add_armral_test(polar_rate_matching + # test/UpperPHY/Polar/RateMatching/main.cpp) + # add_armral_test(polar_rate_recovery + # test/UpperPHY/Polar/RateRecovery/main.cpp) + # add_armral_test(polar_subchannel_deinterleave + # test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + # add_armral_test(polar_subchannel_interleave + # test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + # add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + # add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + # add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) + # add_armral_test(turbo_rate_matching + # test/UpperPHY/Turbo/RateMatching/main.cpp) + # add_armral_test(turbo_rate_recovery + # test/UpperPHY/Turbo/RateRecovery/main.cpp) + # add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + + # add_armral_bench( + # matrix_inv_batch_general + # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + # add_armral_bench( + # matrix_inv_batch_general_pa + # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + # add_armral_bench( + # matrix_inv_batch_hermitian + # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + # add_armral_bench( + # matrix_inv_batch_hermitian_pa + # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + # add_armral_bench(matrix_inv_single_general + # bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + # add_armral_bench( + # matrix_inv_single_hermitian + # bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + # add_armral_bench(arm_solve_1x2 + # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + # add_armral_bench(arm_solve_1x4 + # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + # add_armral_bench(arm_solve_2x2 + # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + # add_armral_bench(arm_solve_2x4 + # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + # add_armral_bench(arm_solve_4x4 + # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_i16_32b + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_i16_32b_pa + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_i16_64b + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_i16_64b_pa + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_f32 + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + # add_armral_bench( + # matrix_vector_mult_batch_f32_pa + # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + # add_armral_bench( + # matrix_mult_i16_32b + # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + # add_armral_bench( + # matrix_mult_i16_64b + # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + # add_armral_bench( + # matrix_mult_f32_2x2_iq + # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + # add_armral_bench( + # matrix_mult_f32_2x2 + # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + # add_armral_bench( + # matrix_mult_f32_4x4_iq + # 
bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + # add_armral_bench( + # matrix_mult_f32_4x4 + # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + # add_armral_bench( + # matmul_f32_general + # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + # add_armral_bench( + # matrix_mult_aah_32 + # bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + # add_armral_bench( + # matrix_mult_ahb_32 + # bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + # add_armral_bench( + # matrix_vector_mult_i16_32b + # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + # add_armral_bench( + # matrix_vector_mult_i16_64b + # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + # add_armral_bench( + # matrix_vector_mult_32 + # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + # add_armral_bench(matrix_pseudo_inv_direct + # bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + # add_armral_bench(vec_dot_16 + # bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + # add_armral_bench(vec_dot_16_2 + # bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + # add_armral_bench(vec_dot_16_2_32_bit + # bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + # add_armral_bench(vec_dot_16_32_bit + # bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + # add_armral_bench(vec_dot_32 + # bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + # add_armral_bench(vec_dot_32_2 + # bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + # add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + # add_armral_bench(vec_mul_16_2 + # bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + # add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + # add_armral_bench(vec_mul_32_2 + # bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + # add_armral_bench(mu_law_compression_14bit + # bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + # add_armral_bench(mu_law_compression_8bit + # bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + # add_armral_bench(mu_law_compression_9bit + # bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + # add_armral_bench(mu_law_decompression_14bit + # bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + # add_armral_bench(mu_law_decompression_8bit + # bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + # add_armral_bench(mu_law_decompression_9bit + # bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + # add_armral_bench( + # block_float_compression_12bit + # bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + # add_armral_bench( + # block_float_compression_14bit + # bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + # add_armral_bench( + # block_float_compression_8bit + # bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + # add_armral_bench( + # block_float_compression_9bit + # bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + # add_armral_bench( + # block_float_decompression_12bit + # bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + # add_armral_bench( + # block_float_decompression_14bit + # bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + # add_armral_bench( + # block_float_decompression_8bit + # bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + # add_armral_bench( + # block_float_decompression_9bit + # bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + # add_armral_bench( 
+ # block_scaling_compression_14bit + # bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + # add_armral_bench( + # block_scaling_compression_8bit + # bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + # add_armral_bench( + # block_scaling_compression_9bit + # bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + # add_armral_bench( + # block_scaling_decompression_14bit + # bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + # add_armral_bench( + # block_scaling_decompression_8bit + # bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + # add_armral_bench( + # block_scaling_decompression_9bit + # bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + # add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + # add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + # add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + # add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + # add_armral_bench(arm_fir_filter_cs16_decimate_2 + # bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + # add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + # add_armral_bench(arm_fir_filter_cf32_decimate_2 + # bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + # add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + # add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + # add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + # add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + # add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + # add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + # add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + # add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + # add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + # add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + # add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + # add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + # add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + # add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + # add_armral_bench(tail_biting_convolutional_decoding + # bench/UpperPHY/ConvolutionalDecoder/main.cpp) + # add_armral_bench(tail_biting_convolutional_encoding + # bench/UpperPHY/ConvolutionalEncoder/main.cpp) + # add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + # add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + # add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + # add_armral_bench(ldpc_rate_matching + # bench/UpperPHY/LDPC/RateMatching/main.cpp) + # add_armral_bench(ldpc_rate_recovery + # bench/UpperPHY/LDPC/RateRecovery/main.cpp) + # add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + # add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + # add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + # add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + # add_armral_bench(polar_rate_matching + # bench/UpperPHY/Polar/RateMatching/main.cpp) + # add_armral_bench(polar_rate_recovery + # bench/UpperPHY/Polar/RateRecovery/main.cpp) + # add_armral_bench(polar_subchannel_deinterleave + # bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + # 
add_armral_bench(polar_subchannel_interleave + # bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + # add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + # add_armral_bench(turbo_rate_matching + # bench/UpperPHY/Turbo/RateMatching/main.cpp) + # add_armral_bench(turbo_rate_recovery + # bench/UpperPHY/Turbo/RateRecovery/main.cpp) + # add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) + # cmake-format: on + else() + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + add_armral_test(matrix_inv_single + test/BasicMathFun/MatrixInv/Single/main.cpp) + add_armral_test(arm_solve + test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + add_armral_test( + matrix_vector_mult_batch_16 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_batch_32 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_mult_16 + test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + add_armral_test(matrix_mult_32 + test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + add_armral_test( + matrix_mult_aah_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_test( + matrix_mult_ahb_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_test( + matrix_vector_mult_single_16 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_single_32 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_pseudo_inv_direct + test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_test(vec_dot_16 + test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_test(vec_dot_16_2 + test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_test(vec_dot_16_2_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_test(vec_dot_16_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_test(vec_dot_32 + test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_test(vec_dot_32_2 + test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_test(vec_mul_16_2 + test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_test(vec_mul_32_2 + test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_test(mu_law_compression + test/DuRuInterface/MuLaw/Compression/main.cpp) + add_armral_test(mu_law_decompression + test/DuRuInterface/MuLaw/Decompression/main.cpp) + add_armral_test(block_float_compression + test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + add_armral_test(block_float_decompression + test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + add_armral_test(block_scaling_compression + test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + add_armral_test(block_scaling_decompression + test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + add_armral_test(arm_fir_filter_cs16_decimate_2 + test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + 
add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + add_armral_test(arm_fir_filter_cf32_decimate_2 + test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(polar_crc_attachment + test/UpperPHY/Polar/CrcAttachment/main.cpp) + add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) + add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) + add_armral_test(polar_rate_matching + test/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_test(polar_rate_recovery + test/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_test(polar_subchannel_deinterleave + test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_test(polar_subchannel_interleave + test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) + add_armral_test(turbo_rate_matching + test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery + test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + + add_armral_bench( + matrix_inv_batch_general + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_general_pa + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian_pa + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + add_armral_bench(matrix_inv_single_general + bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + add_armral_bench( + matrix_inv_single_hermitian + bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + add_armral_bench(arm_solve_1x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + add_armral_bench(arm_solve_1x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + add_armral_bench(arm_solve_2x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + add_armral_bench(arm_solve_2x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + add_armral_bench(arm_solve_4x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + add_armral_bench( + 
matrix_vector_mult_batch_i16_64b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32 + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + add_armral_bench( + matrix_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + add_armral_bench( + matrix_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + add_armral_bench( + matmul_f32_general + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + add_armral_bench( + matrix_mult_aah_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_bench( + matrix_mult_ahb_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + add_armral_bench( + matrix_vector_mult_32 + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_bench(matrix_pseudo_inv_direct + bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_bench(vec_dot_16 + bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_bench(vec_dot_16_2 + bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_bench(vec_dot_16_2_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_bench(vec_dot_16_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_bench(vec_dot_32 + bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_bench(vec_dot_32_2 + bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_bench(vec_mul_16_2 + bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_bench(vec_mul_32_2 + bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_bench(mu_law_compression_14bit + bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + add_armral_bench(mu_law_compression_8bit + bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + add_armral_bench(mu_law_compression_9bit + bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + add_armral_bench(mu_law_decompression_14bit + bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + add_armral_bench(mu_law_decompression_8bit + bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + add_armral_bench(mu_law_decompression_9bit + bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + add_armral_bench( + block_float_compression_12bit + bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + add_armral_bench( + 
block_float_compression_14bit + bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + add_armral_bench( + block_float_compression_8bit + bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + add_armral_bench( + block_float_compression_9bit + bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + add_armral_bench( + block_float_decompression_12bit + bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + add_armral_bench( + block_float_decompression_14bit + bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + add_armral_bench( + block_float_decompression_8bit + bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + add_armral_bench( + block_float_decompression_9bit + bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + add_armral_bench( + block_scaling_compression_14bit + bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + add_armral_bench( + block_scaling_compression_8bit + bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + add_armral_bench( + block_scaling_compression_9bit + bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + add_armral_bench( + block_scaling_decompression_14bit + bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + add_armral_bench( + block_scaling_decompression_8bit + bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + add_armral_bench( + block_scaling_decompression_9bit + bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + add_armral_bench(arm_fir_filter_cs16_decimate_2 + bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + add_armral_bench(arm_fir_filter_cf32_decimate_2 + bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching + 
bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery + bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + add_armral_bench(polar_rate_matching + bench/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_bench(polar_rate_recovery + bench/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_bench(polar_subchannel_deinterleave + bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_bench(polar_subchannel_interleave + bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) + endif() endif() if(BUILD_EXAMPLES) @@ -799,13 +1009,15 @@ if(BUILD_EXAMPLES) add_dependencies(run_examples run_${EXAMPLE_EXE}) endfunction() - add_armral_example(examples/block_float_9b_example.c) - add_armral_example(examples/fft_cf32_example.c 10) - add_armral_example(examples/modulation_example.c) - add_armral_example(examples/polar_example.cpp 128 100 35) + if(NOT ARMRAL_ARCH STREQUAL "HWY") + add_armral_example(examples/block_float_9b_example.c) + add_armral_example(examples/fft_cf32_example.c 10) + add_armral_example(examples/modulation_example.c) + add_armral_example(examples/polar_example.cpp 128 100 35) + endif() endif() -if(BUILD_SIMULATION) +if(BUILD_SIMULATION AND NOT (ARMRAL_ARCH STREQUAL "HWY")) # Include simulation rules and targets This involves building dependencies # like AWGN library and OpenMP add_subdirectory(simulation) diff --git a/armral_acle.cmake.in b/armral_acle.cmake.in new file mode 100644 index 0000000..d1e9c0d --- /dev/null +++ b/armral_acle.cmake.in @@ -0,0 +1,136 @@ +if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # If the optimization flags are already set, don't try and guess what they + # should be. + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.5-a+sve2+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.2-a+sve+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "NEON") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8-a+crypto" + CACHE INTERNAL "") + else() + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # We explicitly set the optimization flags, so just copy those. 
We still need + # to set the appropriate SVE version definition + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +else() + set(ARMRAL_ARCH_COMPILE_OPTIONS "") + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +endif() + +set(ARMRAL_LIB_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) + +add_library(armral ${ARMRAL_LIB_SOURCES}) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in new file mode 100644 index 0000000..970d1d2 --- /dev/null +++ b/armral_hwy.cmake.in @@ -0,0 +1,128 @@ +cmake_minimum_required(VERSION 3.10) + +# TODO possibly switch highway from a submodule to ExternalProject_Add +set(HWY_ENABLE_CONTRIB + OFF + CACHE BOOL "Include HWY contrib/ folder") +set(HWY_ENABLE_EXAMPLES + OFF + CACHE BOOL "Build HWY examples") +# set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install HWY library") +set(HWY_ENABLE_TESTS + OFF + CACHE BOOL "Enable HWY tests") + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-march=native" COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + +if(ARMRAL_OPT_FLAGS) + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + # handle configuring static dispatch for a specified -m string + set(HWY_COMPILE_ONLY_STATIC + ON + CACHE BOOL "") + add_compile_options(${ARMRAL_ARCH_COMPILE_OPTIONS}) +elseif(COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + # pick a less conservative baseline where possible + add_compile_options("-march=native") +endif() +add_subdirectory(highway) + +set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_HWY=1") + +if(ARMRAL_OPT_FLAGS) + target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) +endif() + +set(ARMRAL_LIB_SOURCES + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp +) + +add_library(armral ${ARMRAL_LIB_SOURCES}) + +target_link_libraries(armral PUBLIC hwy) +target_link_libraries(armral_utils PUBLIC hwy) diff --git a/highway b/highway new file mode 160000 index 0000000..54731f5 --- /dev/null +++ b/highway @@ -0,0 +1 @@ +Subproject commit 54731f560d036db8e50b96eefad258bd0b35d50c diff --git a/include/armral.h b/include/armral.h index c495fde..9a3b058 100644 --- a/include/armral.h +++ b/include/armral.h @@ -77,6 +77,7 @@ * formats. */ +#ifndef ARMRAL_ARCH_HWY // GCC sometimes complains about use of uninitialized values in arm_neon.h. // nothing we can do about that, so ignore it! #ifndef __clang__ @@ -90,6 +91,27 @@ // Restore original warning flags. #ifndef __clang__ #pragma GCC diagnostic pop +#endif +#else +// GCC sometimes complains about declaration shadowing members in arm_neon-inl.h. +// nothing we can do about that, so ignore it! +#ifndef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#endif + +#include "hwy/highway.h" + +// Restore original warning flags. +#ifndef __clang__ +#pragma GCC diagnostic pop +#endif + +#if !HWY_ARCH_ARM +using float32_t = float; +using float64_t = double; +#endif + #endif #include diff --git a/utils/rng.cpp b/utils/rng.cpp index 33887ee..2385df7 100644 --- a/utils/rng.cpp +++ b/utils/rng.cpp @@ -2,7 +2,12 @@ Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ +#if defined(ARMRAL_ARCH_HWY) && !HWY_ARCH_ARM +using float32_t = float; +using float64_t = double; +#else #include +#endif #include "rng.hpp" -- GitLab From d54740fb30a80afed7c81bc068cb04599c9cd35e Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 12 Dec 2024 13:16:57 +0000 Subject: [PATCH 02/20] Add Highway CRC implementation The Barret reduction did not lend itself to accommodating variable vector sizes so fixed size 128-bit highway vectors are used. Additionally due to the lack of the pmull instruction in SVE and the severe performance overhead of Highway's emulated version SVE targets are disabled for the CRC compilation units. As such all Arm platforms will use the Highway's NEON implementation. 
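To make the mapping concrete, the port reduces to the following Highway sketch (illustrative only; the real helpers live in the new src/UpperPHY/CRC/highway/crc_common.hpp added by this patch, and the names d64, V64x2, clmul_lo, clmul_hi and load_dup_u64 are placeholders rather than the names used in the source):

    #include <cstdint>
    #include <hwy/highway.h>

    namespace hn = hwy::HWY_NAMESPACE;

    // Fixed 128-bit descriptor: always two uint64 lanes, so the Barret
    // reduction layout matches the original NEON kernels.
    const hn::Full128<uint64_t> d64;
    using V64x2 = hn::Vec<decltype(d64)>;

    // Carry-less multiplies of the low and high 64-bit lanes; on NEON with
    // the AES extension these lower to pmull and pmull2.
    HWY_ATTR inline V64x2 clmul_lo(V64x2 a, V64x2 b) {
      return hn::CLMulLower(a, b);
    }
    HWY_ATTR inline V64x2 clmul_hi(V64x2 a, V64x2 b) {
      return hn::CLMulUpper(a, b);
    }

    // Stand-in for vld1q_dup_p64: read through the pointer and splat the
    // value with Set.
    HWY_ATTR inline V64x2 load_dup_u64(const uint64_t *p) {
      return hn::Set(d64, *p);
    }

The last helper is the source of the small-input overhead described in the next paragraph.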
Due to lack of a BroadcastLoad Highway operation vld1q_dup_p64 is implemented by dereference and Set Operation. This incurs a 1 cycle overhead that is visible on the performance with 8 byte input length. Beyond 8 byte inputs the performance is on par with the original implementation, rising to a 10% performance improvement with buffers above 1024 bytes long. --- CMakeLists.txt | 26 +-- armral_hwy.cmake.in | 40 +++- src/UpperPHY/CRC/acle/crc_common.hpp | 271 +++++++++++++++++++++++ src/UpperPHY/CRC/crc_common.hpp | 272 +----------------------- src/UpperPHY/CRC/highway/crc_common.hpp | 259 ++++++++++++++++++++++ 5 files changed, 582 insertions(+), 286 deletions(-) create mode 100644 src/UpperPHY/CRC/acle/crc_common.hpp create mode 100644 src/UpperPHY/CRC/highway/crc_common.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 15fff4f..90d7265 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,7 @@ if(BUILD_TESTING) # test/LowerPHY/FIR/FIR32Decimate2/main.cpp) # add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) # add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - # add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) # add_armral_test(tail_biting_convolutional_decoding # test/UpperPHY/ConvolutionalDecoder/main.cpp) # add_armral_test(tail_biting_convolutional_encoding @@ -624,18 +624,18 @@ if(BUILD_TESTING) # bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) # add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) # add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - # add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - # add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - # add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - # add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - # add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - # add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - # add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - # add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - # add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - # add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - # add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - # add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) # add_armral_bench(tail_biting_convolutional_decoding # bench/UpperPHY/ConvolutionalDecoder/main.cpp) # add_armral_bench(tail_biting_convolutional_encoding diff 
--git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index 970d1d2..5a0088f 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -34,6 +34,34 @@ if(ARMRAL_OPT_FLAGS) target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) endif() +# The PMULL instruction is required for CRC which requires the AES extension +# that is only available under NEON and SVE2 on aarch64. To avoid falling back +# to generic implementations we fix ourselves on NEON for all Arm platforms +set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + +# GCC recognizes the usage of XOR as an associative operation, then it tries to +# optimize the operation tree in its tree-reassoc pass, but it actually makes +# the performance much worse. Disabling the tree-assoc pass means that the +# compiler uses our carefully balanced operation tree instead. +set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + APPEND + PROPERTY COMPILE_OPTIONS $<$:-fno-tree-reassoc>) + set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp @@ -92,12 +120,12 @@ set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c diff --git a/src/UpperPHY/CRC/acle/crc_common.hpp b/src/UpperPHY/CRC/acle/crc_common.hpp new file mode 100644 index 0000000..47bf69e --- /dev/null +++ b/src/UpperPHY/CRC/acle/crc_common.hpp @@ -0,0 +1,271 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates +*/ +#pragma once + +#include + +static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { + // Sometimes compilers don't realize that they don't need an extra + // instruction to extract the 0th lane of a vector, e.g. when doing + // vmull_p64(a[0], b[0]), so this just gets around that. + poly128_t res; + asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { + // If vmull_high_p64 is used, then clang might use a mov to general + // purpose registers and back follow by a pmull. This forces the use + // of a single pmull2 instruction instead. + poly128_t res; + asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +template +static inline poly64x2_t load_p64x2(const poly64_t *p_in) { + poly64x2_t vec = vld1q_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +template +static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { + poly64x2_t vec = vld1q_dup_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { + // There are two reasons why we can't just use the vaddq_p64 intrinsic: + // 1. It isn't available on the earliest GCC version we currently support + // 2. If GCC recognizes that this is an associative operation, then it tries + // to optimize the operation tree in its tree-reassoc pass, but it + // actually makes the performance much worse. Hiding it in assembly means + // that the compiler uses our carefully balanced operation tree instead. + uint8x16_t res; + asm("eor %0.16b, %1.16b, %2.16b" + : "=w"(res) + : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); + return (poly64x2_t)res; +} + +/** + * Computes a CRC64 in big- or little-endian mode using the specified shifts + * and polynomials. This can be used for smaller polynomials by shifting + * them to a degree 64 polynomial. + * + * @tparam BarretShift the shift used when computing @c ls1_divp. + * @param[in] size number of bytes of the given buffer + * @param[in] input points to the input byte sequence + * @param[out] crc the computed CRC + * @param[in] constants the constants specific to each polynomial: + constants[0] = padding + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, + for k in [1,1,2,3,4,5,6,7,8,9] ] + */ +template +static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { + const poly64_t *p_in = (const poly64_t *)input; + + if (size == 8) { + // Special case for <=64 bits + poly64x2_t divp_p = vld1q_p64(&constants[1]); + + // This might compile to a separate ldr and dup, which is + // fine because the operation using the upper half depends + // on the output of the operation using the lower half. 
+ poly64x2_t v11 = load_dup_p64(p_in); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); + vb = add_p64x2(vb, v11); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load constants for size = 16 + poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); + poly64x2_t ls11modp = vld1q_p64(&constants[2]); + poly64x2_t ls23modp = vld1q_p64(&constants[4]); + + if (size == 16) { + poly64x2_t v21 = load_p64x2(p_in); + poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load the rest of the constants + poly64x2_t ls45modp = vld1q_p64(&constants[6]); + poly64x2_t ls67modp = vld1q_p64(&constants[8]); + poly64x2_t ls89modp = vld1q_p64(&constants[10]); + + if (size == 32) { + poly64x2_t v43a = load_p64x2(p_in); + poly64x2_t v19 = load_p64x2(p_in + 2); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); + poly64x2_t v01 = add_p64x2(v01a, v01e); + v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit + uint32_t init_bytes = size % 64; + const poly64_t *p_end = p_in + (size - 16) / 8; + + // These values are carried forwards to the next loop iteration each time. 
+ poly64x2_t v01; + + if (init_bytes == 16) { + v01 = vdupq_n_p64(0); + p_in += 8; + } else if (init_bytes == 32) { + poly64x2_t v43 = load_p64x2(p_in); + p_in += 10; + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01 = add_p64x2(v01a, v01e); + } else if (init_bytes == 48) { + poly64x2_t v65 = load_p64x2(p_in); + poly64x2_t v43 = load_p64x2(p_in + 2); + p_in += 12; + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + } else { + poly64x2_t v87 = load_p64x2(p_in); + poly64x2_t v65 = load_p64x2(p_in + 2); + poly64x2_t v43 = load_p64x2(p_in + 4); + p_in += 14; + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v19 = load_p64x2(p_in - 8); + + if (size <= 64) { + poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + poly64x2_t v87 = load_p64x2(p_in - 6); + poly64x2_t v65 = load_p64x2(p_in - 4); + poly64x2_t v43 = load_p64x2(p_in - 2); + + while (p_in < p_end) { + poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + v19 = load_p64x2(p_in); + v87 = load_p64x2(p_in + 2); + + poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01b = add_p64x2(v01b, v01bb); + + poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + + v65 = load_p64x2(p_in + 4); + v43 = load_p64x2(p_in + 6); + p_in += 8; + + v01a = add_p64x2(v01a, v01aa); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v21 = load_p64x2(p_in); + + poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01f = add_p64x2(v01f, v01ff); + v01e = add_p64x2(v01e, v01ee); + v01e = add_p64x2(v01e, v01f); + + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); +} diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 47bf69e..755761b 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -2,270 +2,8 @@ Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ -#pragma once - -#include - -static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { - // Sometimes compilers don't realize that they don't need an extra - // instruction to extract the 0th lane of a vector, e.g. when doing - // vmull_p64(a[0], b[0]), so this just gets around that. - poly128_t res; - asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { - // If vmull_high_p64 is used, then clang might use a mov to general - // purpose registers and back follow by a pmull. This forces the use - // of a single pmull2 instruction instead. - poly128_t res; - asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -template -static inline poly64x2_t load_p64x2(const poly64_t *p_in) { - poly64x2_t vec = vld1q_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -template -static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { - poly64x2_t vec = vld1q_dup_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { - // There are two reasons why we can't just use the vaddq_p64 intrinsic: - // 1. It isn't available on the earliest GCC version we currently support - // 2. If GCC recognizes that this is an associative operation, then it tries - // to optimize the operation tree in its tree-reassoc pass, but it - // actually makes the performance much worse. Hiding it in assembly means - // that the compiler uses our carefully balanced operation tree instead. - uint8x16_t res; - asm("eor %0.16b, %1.16b, %2.16b" - : "=w"(res) - : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); - return (poly64x2_t)res; -} - -/** - * Computes a CRC64 in big- or little-endian mode using the specified shifts - * and polynomials. This can be used for smaller polynomials by shifting - * them to a degree 64 polynomial. - * - * @tparam BarretShift the shift used when computing @c ls1_divp. 
- * @param[in] size number of bytes of the given buffer - * @param[in] input points to the input byte sequence - * @param[out] crc the computed CRC - * @param[in] constants the constants specific to each polynomial: - constants[0] = padding - constants[1] = (1<<128) / P_CRC - (1<<64) - constants[2:11] = [ (1<<(64*k)) mod P_CRC, - for k in [1,1,2,3,4,5,6,7,8,9] ] - */ -template -static inline __attribute__((always_inline)) void -crc64(uint32_t size, const uint64_t *input, uint64_t *crc, - const poly64_t constants[]) { - const poly64_t *p_in = (const poly64_t *)input; - - if (size == 8) { - // Special case for <=64 bits - poly64x2_t divp_p = vld1q_p64(&constants[1]); - - // This might compile to a separate ldr and dup, which is - // fine because the operation using the upper half depends - // on the output of the operation using the lower half. - poly64x2_t v11 = load_dup_p64(p_in); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); - vb = add_p64x2(vb, v11); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load constants for size = 16 - poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); - poly64x2_t ls11modp = vld1q_p64(&constants[2]); - poly64x2_t ls23modp = vld1q_p64(&constants[4]); - - if (size == 16) { - poly64x2_t v21 = load_p64x2(p_in); - poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load the rest of the constants - poly64x2_t ls45modp = vld1q_p64(&constants[6]); - poly64x2_t ls67modp = vld1q_p64(&constants[8]); - poly64x2_t ls89modp = vld1q_p64(&constants[10]); - - if (size == 32) { - poly64x2_t v43a = load_p64x2(p_in); - poly64x2_t v19 = load_p64x2(p_in + 2); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); - poly64x2_t v01 = add_p64x2(v01a, v01e); - v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit - uint32_t init_bytes = size % 64; - const poly64_t *p_end = p_in + (size - 16) / 8; - - // These values are carried forwards to the next loop iteration each time. 
- poly64x2_t v01; - - if (init_bytes == 16) { - v01 = vdupq_n_p64(0); - p_in += 8; - } else if (init_bytes == 32) { - poly64x2_t v43 = load_p64x2(p_in); - p_in += 10; - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01 = add_p64x2(v01a, v01e); - } else if (init_bytes == 48) { - poly64x2_t v65 = load_p64x2(p_in); - poly64x2_t v43 = load_p64x2(p_in + 2); - p_in += 12; - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - } else { - poly64x2_t v87 = load_p64x2(p_in); - poly64x2_t v65 = load_p64x2(p_in + 2); - poly64x2_t v43 = load_p64x2(p_in + 4); - p_in += 14; - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v19 = load_p64x2(p_in - 8); - - if (size <= 64) { - poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - poly64x2_t v87 = load_p64x2(p_in - 6); - poly64x2_t v65 = load_p64x2(p_in - 4); - poly64x2_t v43 = load_p64x2(p_in - 2); - - while (p_in < p_end) { - poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - v19 = load_p64x2(p_in); - v87 = load_p64x2(p_in + 2); - - poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01b = add_p64x2(v01b, v01bb); - - poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - - v65 = load_p64x2(p_in + 4); - v43 = load_p64x2(p_in + 6); - p_in += 8; - - v01a = add_p64x2(v01a, v01aa); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v21 = load_p64x2(p_in); - - poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01f = add_p64x2(v01f, v01ff); - v01e = add_p64x2(v01e, v01ee); - v01e = add_p64x2(v01e, v01f); - - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); -} +#ifndef ARMRAL_ARCH_HWY +#include "acle/crc_common.hpp" +#else +#include "highway/crc_common.hpp" +#endif diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp new file mode 100644 index 0000000..7b2aad1 --- /dev/null +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -0,0 +1,259 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include + +namespace hn = hwy::HWY_NAMESPACE; + +// Allow compilation on non-arm architectures by aliasing poly64_t to an existing type +// Test if arm_neon.h has been included +#ifndef _AARCH64_NEON_H_ +using poly64_t = uint64_t; +#endif + +// Operation assumptions +// vmull_p64 = CLMulLower +// vmull_high_p64 = CLMulUpper +// vaddq_p64 = Xor +// vrev64q_u8 = Reverse8 +// vld1q_p64 = LoadU +// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced pointer + +const hn::Full128 du64x2; +using DU64x2 = decltype(du64x2); +using Vec64x2 = hn::Vec; + +template +static inline Vec64x2 load_p64x2(const uint64_t *p_in) { + Vec64x2 vec = hn::LoadU(du64x2, p_in); + if (Endianness == 'b') { + const hn::Repartition d8; + vec = hn::BitCast(du64x2, hn::Reverse8(d8, hn::BitCast(d8, vec))); + } + return vec; +} + +template +static inline Vec64x2 load_dup_p64(const uint64_t *p_in) { + Vec64x2 vec = hn::Set(du64x2, *p_in); + if (Endianness == 'b') { + const hn::Repartition d8; + vec = hn::BitCast(du64x2, hn::Reverse8(d8, hn::BitCast(d8, vec))); + } + return vec; +} + +/** + * Computes a CRC64 in big- or little-endian mode using the specified shifts + * and polynomials. This can be used for smaller polynomials by shifting + * them to a degree 64 polynomial. + * + * @tparam BarretShift the shift used when computing @c ls1_divp. 
+ * @param[in] size number of bytes of the given buffer + * @param[in] input points to the input byte sequence + * @param[out] crc the computed CRC + * @param[in] constants the constants specific to each polynomial: + constants[0] = padding + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, + for k in [1,1,2,3,4,5,6,7,8,9] ] + */ +template +HWY_ATTR static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { + const uint64_t *p_in = input; + const uint64_t *constants_u64 = (const uint64_t *)constants; + + if (size == 8) { + // Special case for <=64 bits + Vec64x2 divp_p = hn::LoadU(du64x2, &constants_u64[1]); + + Vec64x2 v11 = load_dup_p64(p_in); + + // Barret reduction + Vec64x2 vb = hn::CLMulLower(v11, divp_p); + vb = hn::Xor(vb, v11); + Vec64x2 v0x = hn::CLMulUpper(vb, divp_p); + *crc = hn::GetLane(v0x); + return; + } + + // Load constants for size = 16 + Vec64x2 lsamodp_divp = hn::LoadU(du64x2, &constants_u64[0]); + Vec64x2 ls11modp = hn::LoadU(du64x2, &constants_u64[2]); + Vec64x2 ls23modp = hn::LoadU(du64x2, &constants_u64[4]); + + if (size == 16) { + Vec64x2 v21 = load_p64x2(p_in); + Vec64x2 v01 = hn::CLMulLower(v21, ls23modp); + Vec64x2 vx1 = hn::Xor(v01, v21); + + // Barret reduction + Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + // Load the rest of the constants + Vec64x2 ls45modp = hn::LoadU(du64x2, &constants_u64[6]); + Vec64x2 ls67modp = hn::LoadU(du64x2, &constants_u64[8]); + Vec64x2 ls89modp = hn::LoadU(du64x2, &constants_u64[10]); + + if (size == 32) { + Vec64x2 v43a = load_p64x2(p_in); + Vec64x2 v19 = load_p64x2(p_in + 2); + Vec64x2 v01e = hn::CLMulLower(v43a, ls45modp); + Vec64x2 v01a = hn::CLMulUpper(v43a, ls23modp); + Vec64x2 v01 = hn::Xor(v01a, v01e); + v01a = hn::CLMulLower(v19, ls23modp); + v01 = hn::Xor(v01, v01a); + Vec64x2 vx1 = hn::Xor(v01, v19); + + // Barret reduction + Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit + uint32_t init_bytes = size % 64; + const uint64_t *p_end = p_in + (size - 16) / 8; + + // These values are carried forwards to the next loop iteration each time. 
+ Vec64x2 v01; + + if (init_bytes == 16) { + v01 = hn::Zero(du64x2); + p_in += 8; + } else if (init_bytes == 32) { + Vec64x2 v43 = load_p64x2(p_in); + p_in += 10; + Vec64x2 v01e = hn::CLMulLower(v43, ls45modp); + Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); + v01 = hn::Xor(v01a, v01e); + } else if (init_bytes == 48) { + Vec64x2 v65 = load_p64x2(p_in); + Vec64x2 v43 = load_p64x2(p_in + 2); + p_in += 12; + Vec64x2 v01g = hn::CLMulLower(v65, ls67modp); + Vec64x2 v01e = hn::CLMulUpper(v65, ls45modp); + Vec64x2 v01c = hn::CLMulLower(v43, ls45modp); + Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + + } else { + Vec64x2 v87 = load_p64x2(p_in); + Vec64x2 v65 = load_p64x2(p_in + 2); + Vec64x2 v43 = load_p64x2(p_in + 4); + p_in += 14; + Vec64x2 v01d = hn::CLMulLower(v87, ls89modp); + Vec64x2 v01c = hn::CLMulUpper(v87, ls67modp); + Vec64x2 v01b = hn::CLMulLower(v65, ls67modp); + Vec64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec64x2 v01g = hn::CLMulLower(v43, ls45modp); + Vec64x2 v01e = hn::CLMulUpper(v43, ls23modp); + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + } + + Vec64x2 v19 = load_p64x2(p_in - 8); + + if (size <= 64) { + Vec64x2 v01a = hn::CLMulLower(v19, ls23modp); + v01 = hn::Xor(v01, v01a); + Vec64x2 vx1 = hn::Xor(v01, v19); + + // Barret reduction + Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + Vec64x2 v87 = load_p64x2(p_in - 6); + Vec64x2 v65 = load_p64x2(p_in - 4); + Vec64x2 v43 = load_p64x2(p_in - 2); + + while (p_in < p_end) { + Vec64x2 v01bb = hn::CLMulLower(v19, lsamodp_divp); + Vec64x2 v01b = hn::CLMulUpper(v87, ls67modp); + Vec64x2 vx9 = hn::Xor(v01, v19); + Vec64x2 v8x = hn::Xor(v87, v01); + + v19 = load_p64x2(p_in); + v87 = load_p64x2(p_in + 2); + + Vec64x2 v01g = hn::CLMulUpper(vx9, ls89modp); + Vec64x2 v01e = hn::CLMulLower(v8x, ls89modp); + + v01b = hn::Xor(v01b, v01bb); + + Vec64x2 v01aa = hn::CLMulLower(v65, ls67modp); + Vec64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec64x2 v01d = hn::CLMulLower(v43, ls45modp); + Vec64x2 v01c = hn::CLMulUpper(v43, ls23modp); + + v65 = load_p64x2(p_in + 4); + v43 = load_p64x2(p_in + 6); + p_in += 8; + + v01a = hn::Xor(v01a, v01aa); + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + } + + Vec64x2 v21 = load_p64x2(p_in); + + Vec64x2 v01ff = hn::CLMulLower(v19, lsamodp_divp); + Vec64x2 v01f = hn::CLMulUpper(v87, ls67modp); + Vec64x2 vx9 = hn::Xor(v01, v19); + Vec64x2 v8x = hn::Xor(v87, v01); + + Vec64x2 v01ee = hn::CLMulUpper(vx9, ls89modp); + Vec64x2 v01e = hn::CLMulLower(v8x, ls89modp); + + v01f = hn::Xor(v01f, v01ff); + v01e = hn::Xor(v01e, v01ee); + v01e = hn::Xor(v01e, v01f); + + Vec64x2 v01d = hn::CLMulLower(v65, ls67modp); + Vec64x2 v01c = hn::CLMulUpper(v65, ls45modp); + Vec64x2 v01b = hn::CLMulLower(v43, ls45modp); + Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); + Vec64x2 v01g = hn::CLMulLower(v21, ls23modp); + + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + + Vec64x2 vx1 = hn::Xor(v01, v21); + + // Barret reduction + Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec64x2 
v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); +} -- GitLab From 335fcc9aba1f3a66cdeb836f1fb7ffc3f3ed0cbf Mon Sep 17 00:00:00 2001 From: Will Barber Date: Mon, 16 Dec 2024 10:24:36 +0000 Subject: [PATCH 03/20] Add copyright notices to newly created files --- src/UpperPHY/CRC/crc_common.hpp | 2 ++ src/UpperPHY/CRC/highway/crc_common.hpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 755761b..e65d27b 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 */ #ifndef ARMRAL_ARCH_HWY #include "acle/crc_common.hpp" diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 7b2aad1..56e891c 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 */ #pragma once -- GitLab From 086f3fa84125d03c4f2d5a804f865fcf25164ed3 Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Tue, 17 Dec 2024 11:20:56 +0000 Subject: [PATCH 04/20] Introduce central file for vector types --- src/UpperPHY/CRC/highway/crc_common.hpp | 169 ++++++++++++------------ src/utils/hwy_types.hpp | 89 +++++++++++++ 2 files changed, 171 insertions(+), 87 deletions(-) create mode 100644 src/utils/hwy_types.hpp diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 56e891c..5877bfa 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -7,6 +7,7 @@ #pragma once #include +#include "utils/hwy_types.hpp" namespace hn = hwy::HWY_NAMESPACE; @@ -24,26 +25,20 @@ using poly64_t = uint64_t; // vld1q_p64 = LoadU // vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced pointer -const hn::Full128 du64x2; -using DU64x2 = decltype(du64x2); -using Vec64x2 = hn::Vec; - template -static inline Vec64x2 load_p64x2(const uint64_t *p_in) { - Vec64x2 vec = hn::LoadU(du64x2, p_in); +static inline Vec_u64x2 load_p64x2(const uint64_t *p_in) { + Vec_u64x2 vec = hn::LoadU(du64x2, p_in); if (Endianness == 'b') { - const hn::Repartition d8; - vec = hn::BitCast(du64x2, hn::Reverse8(d8, hn::BitCast(d8, vec))); + vec = hn::BitCast(du64x2, hn::Reverse8(du8x16, hn::BitCast(du8x16, vec))); } return vec; } template -static inline Vec64x2 load_dup_p64(const uint64_t *p_in) { - Vec64x2 vec = hn::Set(du64x2, *p_in); +static inline Vec_u64x2 load_dup_p64(const uint64_t *p_in) { + Vec_u64x2 vec = hn::Set(du64x2, *p_in); if (Endianness == 'b') { - const hn::Repartition d8; - vec = hn::BitCast(du64x2, hn::Reverse8(d8, hn::BitCast(d8, vec))); + vec = hn::BitCast(du64x2, hn::Reverse8(du8x16, hn::BitCast(du8x16, vec))); } return vec; } @@ -72,56 +67,56 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, if (size == 8) { // Special case for <=64 bits - Vec64x2 divp_p = hn::LoadU(du64x2, &constants_u64[1]); + Vec_u64x2 divp_p = hn::LoadU(du64x2, &constants_u64[1]); - Vec64x2 v11 = load_dup_p64(p_in); + Vec_u64x2 v11 = load_dup_p64(p_in); // Barret reduction - Vec64x2 vb = 
hn::CLMulLower(v11, divp_p); + Vec_u64x2 vb = hn::CLMulLower(v11, divp_p); vb = hn::Xor(vb, v11); - Vec64x2 v0x = hn::CLMulUpper(vb, divp_p); + Vec_u64x2 v0x = hn::CLMulUpper(vb, divp_p); *crc = hn::GetLane(v0x); return; } // Load constants for size = 16 - Vec64x2 lsamodp_divp = hn::LoadU(du64x2, &constants_u64[0]); - Vec64x2 ls11modp = hn::LoadU(du64x2, &constants_u64[2]); - Vec64x2 ls23modp = hn::LoadU(du64x2, &constants_u64[4]); + Vec_u64x2 lsamodp_divp = hn::LoadU(du64x2, &constants_u64[0]); + Vec_u64x2 ls11modp = hn::LoadU(du64x2, &constants_u64[2]); + Vec_u64x2 ls23modp = hn::LoadU(du64x2, &constants_u64[4]); if (size == 16) { - Vec64x2 v21 = load_p64x2(p_in); - Vec64x2 v01 = hn::CLMulLower(v21, ls23modp); - Vec64x2 vx1 = hn::Xor(v01, v21); + Vec_u64x2 v21 = load_p64x2(p_in); + Vec_u64x2 v01 = hn::CLMulLower(v21, ls23modp); + Vec_u64x2 vx1 = hn::Xor(v01, v21); // Barret reduction - Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); vb = hn::Xor(vb, vx1); - Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); v0x = hn::Xor(v0x, v01); *crc = hn::GetLane(v0x); return; } // Load the rest of the constants - Vec64x2 ls45modp = hn::LoadU(du64x2, &constants_u64[6]); - Vec64x2 ls67modp = hn::LoadU(du64x2, &constants_u64[8]); - Vec64x2 ls89modp = hn::LoadU(du64x2, &constants_u64[10]); + Vec_u64x2 ls45modp = hn::LoadU(du64x2, &constants_u64[6]); + Vec_u64x2 ls67modp = hn::LoadU(du64x2, &constants_u64[8]); + Vec_u64x2 ls89modp = hn::LoadU(du64x2, &constants_u64[10]); if (size == 32) { - Vec64x2 v43a = load_p64x2(p_in); - Vec64x2 v19 = load_p64x2(p_in + 2); - Vec64x2 v01e = hn::CLMulLower(v43a, ls45modp); - Vec64x2 v01a = hn::CLMulUpper(v43a, ls23modp); - Vec64x2 v01 = hn::Xor(v01a, v01e); + Vec_u64x2 v43a = load_p64x2(p_in); + Vec_u64x2 v19 = load_p64x2(p_in + 2); + Vec_u64x2 v01e = hn::CLMulLower(v43a, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43a, ls23modp); + Vec_u64x2 v01 = hn::Xor(v01a, v01e); v01a = hn::CLMulLower(v19, ls23modp); v01 = hn::Xor(v01, v01a); - Vec64x2 vx1 = hn::Xor(v01, v19); + Vec_u64x2 vx1 = hn::Xor(v01, v19); // Barret reduction - Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); vb = hn::Xor(vb, vx1); - Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); v0x = hn::Xor(v0x, v01); *crc = hn::GetLane(v0x); return; @@ -132,40 +127,40 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, const uint64_t *p_end = p_in + (size - 16) / 8; // These values are carried forwards to the next loop iteration each time. 
- Vec64x2 v01; + Vec_u64x2 v01; if (init_bytes == 16) { v01 = hn::Zero(du64x2); p_in += 8; } else if (init_bytes == 32) { - Vec64x2 v43 = load_p64x2(p_in); + Vec_u64x2 v43 = load_p64x2(p_in); p_in += 10; - Vec64x2 v01e = hn::CLMulLower(v43, ls45modp); - Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01e = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); v01 = hn::Xor(v01a, v01e); } else if (init_bytes == 48) { - Vec64x2 v65 = load_p64x2(p_in); - Vec64x2 v43 = load_p64x2(p_in + 2); + Vec_u64x2 v65 = load_p64x2(p_in); + Vec_u64x2 v43 = load_p64x2(p_in + 2); p_in += 12; - Vec64x2 v01g = hn::CLMulLower(v65, ls67modp); - Vec64x2 v01e = hn::CLMulUpper(v65, ls45modp); - Vec64x2 v01c = hn::CLMulLower(v43, ls45modp); - Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01g = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01e = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01c = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); v01e = hn::Xor(v01e, v01g); v01a = hn::Xor(v01a, v01c); v01 = hn::Xor(v01a, v01e); } else { - Vec64x2 v87 = load_p64x2(p_in); - Vec64x2 v65 = load_p64x2(p_in + 2); - Vec64x2 v43 = load_p64x2(p_in + 4); + Vec_u64x2 v87 = load_p64x2(p_in); + Vec_u64x2 v65 = load_p64x2(p_in + 2); + Vec_u64x2 v43 = load_p64x2(p_in + 4); p_in += 14; - Vec64x2 v01d = hn::CLMulLower(v87, ls89modp); - Vec64x2 v01c = hn::CLMulUpper(v87, ls67modp); - Vec64x2 v01b = hn::CLMulLower(v65, ls67modp); - Vec64x2 v01a = hn::CLMulUpper(v65, ls45modp); - Vec64x2 v01g = hn::CLMulLower(v43, ls45modp); - Vec64x2 v01e = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01d = hn::CLMulLower(v87, ls89modp); + Vec_u64x2 v01c = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 v01b = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01g = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01e = hn::CLMulUpper(v43, ls23modp); v01c = hn::Xor(v01c, v01d); v01a = hn::Xor(v01a, v01b); v01e = hn::Xor(v01e, v01g); @@ -173,44 +168,44 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, v01 = hn::Xor(v01a, v01e); } - Vec64x2 v19 = load_p64x2(p_in - 8); + Vec_u64x2 v19 = load_p64x2(p_in - 8); if (size <= 64) { - Vec64x2 v01a = hn::CLMulLower(v19, ls23modp); + Vec_u64x2 v01a = hn::CLMulLower(v19, ls23modp); v01 = hn::Xor(v01, v01a); - Vec64x2 vx1 = hn::Xor(v01, v19); + Vec_u64x2 vx1 = hn::Xor(v01, v19); // Barret reduction - Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); vb = hn::Xor(vb, vx1); - Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); v0x = hn::Xor(v0x, v01); *crc = hn::GetLane(v0x); return; } - Vec64x2 v87 = load_p64x2(p_in - 6); - Vec64x2 v65 = load_p64x2(p_in - 4); - Vec64x2 v43 = load_p64x2(p_in - 2); + Vec_u64x2 v87 = load_p64x2(p_in - 6); + Vec_u64x2 v65 = load_p64x2(p_in - 4); + Vec_u64x2 v43 = load_p64x2(p_in - 2); while (p_in < p_end) { - Vec64x2 v01bb = hn::CLMulLower(v19, lsamodp_divp); - Vec64x2 v01b = hn::CLMulUpper(v87, ls67modp); - Vec64x2 vx9 = hn::Xor(v01, v19); - Vec64x2 v8x = hn::Xor(v87, v01); + Vec_u64x2 v01bb = hn::CLMulLower(v19, lsamodp_divp); + Vec_u64x2 v01b = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 vx9 = hn::Xor(v01, v19); + Vec_u64x2 v8x = hn::Xor(v87, v01); v19 = load_p64x2(p_in); v87 = load_p64x2(p_in + 2); - Vec64x2 v01g = hn::CLMulUpper(vx9, ls89modp); - Vec64x2 v01e = hn::CLMulLower(v8x, ls89modp); + Vec_u64x2 v01g = hn::CLMulUpper(vx9, ls89modp); + Vec_u64x2 v01e = 
hn::CLMulLower(v8x, ls89modp); v01b = hn::Xor(v01b, v01bb); - Vec64x2 v01aa = hn::CLMulLower(v65, ls67modp); - Vec64x2 v01a = hn::CLMulUpper(v65, ls45modp); - Vec64x2 v01d = hn::CLMulLower(v43, ls45modp); - Vec64x2 v01c = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01aa = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01d = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01c = hn::CLMulUpper(v43, ls23modp); v65 = load_p64x2(p_in + 4); v43 = load_p64x2(p_in + 6); @@ -224,25 +219,25 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, v01 = hn::Xor(v01a, v01e); } - Vec64x2 v21 = load_p64x2(p_in); + Vec_u64x2 v21 = load_p64x2(p_in); - Vec64x2 v01ff = hn::CLMulLower(v19, lsamodp_divp); - Vec64x2 v01f = hn::CLMulUpper(v87, ls67modp); - Vec64x2 vx9 = hn::Xor(v01, v19); - Vec64x2 v8x = hn::Xor(v87, v01); + Vec_u64x2 v01ff = hn::CLMulLower(v19, lsamodp_divp); + Vec_u64x2 v01f = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 vx9 = hn::Xor(v01, v19); + Vec_u64x2 v8x = hn::Xor(v87, v01); - Vec64x2 v01ee = hn::CLMulUpper(vx9, ls89modp); - Vec64x2 v01e = hn::CLMulLower(v8x, ls89modp); + Vec_u64x2 v01ee = hn::CLMulUpper(vx9, ls89modp); + Vec_u64x2 v01e = hn::CLMulLower(v8x, ls89modp); v01f = hn::Xor(v01f, v01ff); v01e = hn::Xor(v01e, v01ee); v01e = hn::Xor(v01e, v01f); - Vec64x2 v01d = hn::CLMulLower(v65, ls67modp); - Vec64x2 v01c = hn::CLMulUpper(v65, ls45modp); - Vec64x2 v01b = hn::CLMulLower(v43, ls45modp); - Vec64x2 v01a = hn::CLMulUpper(v43, ls23modp); - Vec64x2 v01g = hn::CLMulLower(v21, ls23modp); + Vec_u64x2 v01d = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01c = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01b = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01g = hn::CLMulLower(v21, ls23modp); v01c = hn::Xor(v01c, v01d); v01a = hn::Xor(v01a, v01b); @@ -250,12 +245,12 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, v01a = hn::Xor(v01a, v01c); v01 = hn::Xor(v01a, v01e); - Vec64x2 vx1 = hn::Xor(v01, v21); + Vec_u64x2 vx1 = hn::Xor(v01, v21); // Barret reduction - Vec64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); vb = hn::Xor(vb, vx1); - Vec64x2 v0x = hn::CLMulUpper(vb, ls11modp); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); v0x = hn::Xor(v0x, v01); *crc = hn::GetLane(v0x); } diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp new file mode 100644 index 0000000..91f4bf1 --- /dev/null +++ b/src/utils/hwy_types.hpp @@ -0,0 +1,89 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2023-2025 + Cambridge Consultants Project Reference P5851 +*/ + +/* +Defines base vector types for developing using Google Highway for +vector instructions. If there is a common vector type which is used +across files include it here. Otherwise, for specific vector types, +such as a hn::Rebind should be included locally in source files. +*/ + +#pragma once + +#include +namespace hn = hwy::HWY_NAMESPACE; + +/* +Full128 Vector Types. In general, use these where there is cross-lane +logic is used (e.g. a pairwise add) or where data-chunking is heavily +tied to 128-bit blocks. 
+*/ + +// Vector Tags +const hn::Full128 du8x16; +const hn::Full128 di8x16; +const hn::Full128 du16x8; +const hn::Full128 di16x8; +const hn::Full128 du32x4; +const hn::Full128 di32x4; +const hn::Full128 du64x2; +const hn::Full128 di64x2; + +// Vector Types +using Vec_u8x16 = hn::Vec; +using Vec_i8x16 = hn::Vec; +using Vec_u16x8 = hn::Vec; +using Vec_i16x8 = hn::Vec; +using Vec_u32x4 = hn::Vec; +using Vec_i32x4 = hn::Vec; +using Vec_u64x2 = hn::Vec; +using Vec_i64x2 = hn::Vec; + +// Rebind Tags +/* e.g. const hn::Rebind di16x8_di8x16; +where the first tag named in the rebind tag is the old type +which the rebind tag is created from and the second is the +new tag type. These are used in operations where output vector +width is different from that of the input. */ + + +/* +Scalable vector types. The default choice should be to use +these vector types since it allows for processing of more +data for wider vector widths. Use Full128 for the reasons +listed above. + +Note lack of quantity of vector elements - this is variable. +Use hn::Lanes(vector_tag) to stride by the correct size when +looping over data. +*/ + +// Vector Tags +const hn::ScalableTag du8; +const hn::ScalableTag di8; +const hn::ScalableTag du16; +const hn::ScalableTag di16; +const hn::ScalableTag du32; +const hn::ScalableTag di32; +const hn::ScalableTag du64; +const hn::ScalableTag di64; + +// Vector Types +using Vec_u8 = hn::Vec; +using Vec_i8 = hn::Vec; +using Vec_u16 = hn::Vec; +using Vec_i16 = hn::Vec; +using Vec_u32 = hn::Vec; +using Vec_i32 = hn::Vec; +using Vec_u64 = hn::Vec; +using Vec_i64 = hn::Vec; + +// Rebind Tags +/* e.g. const hn::Rebind di16_di8; +where the first tag named in the rebind tag is the old type +which the rebind tag is created from and the second is the +new tag type. */ \ No newline at end of file -- GitLab From 0e112f0cdb7210829e823141025d760469be77fb Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Thu, 19 Dec 2024 11:00:30 +0000 Subject: [PATCH 05/20] Add Highway Sequence Generation and Scrambling implementations Due to the lack of the pmull instruction in SVE and the severe performance overhead of Highway's emulated version, SVE targets are disabled for the Sequence Generation compilation units. As such all Arm platforms will use the Highway's NEON implementation. 
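For reference, a minimal standalone sketch (not part of this patch; the helper name is illustrative) of why the restriction matters: with the SVE targets compiled out via HWY_DISABLED_TARGETS, static dispatch on aarch64 resolves to NEON, so hn::CLMulLower lowers to PMULL instead of Highway's emulated carry-less multiply.

  #define HWY_DISABLED_TARGETS (HWY_SVE2_128 | HWY_SVE2 | HWY_SVE_256 | HWY_SVE)
  #include <stdint.h>
  #include <hwy/highway.h>
  namespace hn = hwy::HWY_NAMESPACE;

  // Illustrative helper only: carry-less multiply of the low 64-bit lanes.
  // On the NEON target this maps to a single PMULL.
  static inline uint64_t clmul_low64(uint64_t a, uint64_t b) {
    const hn::Full128<uint64_t> d;
    return hn::GetLane(hn::CLMulLower(hn::Set(d, a), hn::Set(d, b)));
  }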
--- CMakeLists.txt | 8 +- armral_hwy.cmake.in | 29 +++-- .../Scrambling/highway/arm_scrambling.cpp | 65 +++++++++++ .../highway/arm_mat_seq_generator.cpp | 108 ++++++++++++++++++ src/utils/hwy_types.hpp | 26 ++++- 5 files changed, 220 insertions(+), 16 deletions(-) create mode 100644 src/LowerPHY/Scrambling/highway/arm_scrambling.cpp create mode 100644 src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 90d7265..6a20eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -423,8 +423,8 @@ if(BUILD_TESTING) # add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) # add_armral_test(arm_fir_filter_cf32_decimate_2 # test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - # add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - # add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) add_armral_test(crc test/UpperPHY/CRC/main.cpp) # add_armral_test(tail_biting_convolutional_decoding # test/UpperPHY/ConvolutionalDecoder/main.cpp) @@ -622,8 +622,8 @@ if(BUILD_TESTING) # add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) # add_armral_bench(arm_fir_filter_cf32_decimate_2 # bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - # add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - # add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index 5a0088f..e59af20 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -34,16 +34,29 @@ if(ARMRAL_OPT_FLAGS) target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) endif() +# The PMULL instruction is required for CRC and other functions which requires +# the AES extension that is only available under NEON and SVE2 on aarch64. To +# avoid falling back to generic implementations we fix ourselves on NEON for all +# Arm platforms +set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + # The PMULL instruction is required for CRC which requires the AES extension # that is only available under NEON and SVE2 on aarch64. 
To avoid falling back # to generic implementations we fix ourselves on NEON for all Arm platforms set_property( - SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp APPEND PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) @@ -118,8 +131,8 @@ set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp diff --git a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp new file mode 100644 index 0000000..3c3e502 --- /dev/null +++ b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp @@ -0,0 +1,65 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#include "armral.h" + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +// Highway vector type +using Mask_u8 = hn::Mask; + +HWY_API void xor_u8(const uint8_t *__restrict &src, + const uint8_t *__restrict &seq, uint8_t *&dst, + size_t n_lanes) { + hn::StoreU(hn::Xor(hn::LoadU(du8, src), hn::LoadU(du8, seq)), du8, dst); + src += n_lanes; + seq += n_lanes; + dst += n_lanes; +} + +armral_status armral_scramble_code_block(const uint8_t *__restrict src, + const uint8_t *__restrict seq, + uint32_t num_bits, uint8_t *dst) { + size_t bytes = (num_bits + 7) >> 3; + size_t n_lanes = hn::Lanes(du8); + + // No vectorization for less than 8 bytes. + if (bytes < 8) { + Mask_u8 early_mask = hn::FirstN(du8, bytes); + Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, early_mask, src); + Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, early_mask, seq); + hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, bytes); + return ARMRAL_SUCCESS; + } + + size_t rem_bytes = bytes; + + // Use unrolled vectorized loop if there are at least 32 bytes. + if (rem_bytes > 31) { + uint32_t full_256_vecs = rem_bytes >> 5; + rem_bytes = rem_bytes % 32; + while (full_256_vecs > 0) { + xor_u8(src, seq, dst, n_lanes); + xor_u8(src, seq, dst, n_lanes); + full_256_vecs--; + } + } + + // Process 16 of the remaining bytes. 
+ if (rem_bytes > 15) { + rem_bytes = rem_bytes % 16; + xor_u8(src, seq, dst, n_lanes); + } + + // Process final partial vector. + if (rem_bytes != 0) { + Mask_u8 final_mask = hn::FirstN(du8, rem_bytes); + Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, final_mask, src); + Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, final_mask, seq); + hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, rem_bytes); + } + + return ARMRAL_SUCCESS; +} diff --git a/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp new file mode 100644 index 0000000..9f39abc --- /dev/null +++ b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp @@ -0,0 +1,108 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#include "armral.h" + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +template +static inline void generate_seq_128(uint64_t *x) { + static_assert(N == 2); + + uint64_t pmask[3] = {0x303, 0x1111, 0xffff000}; + Vec_u64x2 mask_low_20 = hn::Set(du64x2, 0xfffff); + Vec_u64x2 mask_high_16 = hn::Set(du64x2, 0xffff000000000000); + + Vec_u64x2 x_vec = hn::Set(du64x2, *x); + + Vec_u64x2 low_20 = hn::ShiftRight<44>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[0]))); + Vec_u64x2 mid_28 = hn::ShiftRight<16>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); + Vec_u64x2 high_16 = hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[2])); + + Vec_u64x2 intermediate_result = hn::BitwiseIfThenElse(mask_low_20, low_20, mid_28); + *x = hn::GetLane(hn::BitwiseIfThenElse(mask_high_16, high_16, intermediate_result)); +} + +template +static inline void generate_seq_64(uint64_t *x) { + static_assert((N == 1) || (N == 2)); + + uint64_t pmask[3]; + if (N == 1) { + pmask[0] = 0x9; + pmask[1] = 0x41; + pmask[2] = 0x24900000; + } else { + pmask[0] = 0xf; + pmask[1] = 0x55; + pmask[2] = 0x30300000; + } + Vec_u64x2 mask_low_28 = hn::Set(du64x2, 0xfffffff); + Vec_u64x2 mask_high_8 = hn::Set(du64x2, 0xff00000000000000); + + Vec_u64x2 x_vec = hn::Set(du64x2, *x); + + Vec_u64x2 low_28 = hn::ShiftRight<36>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[0]))); + Vec_u64x2 mid_28 = hn::ShiftRight<8>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); + Vec_u64x2 high_8 = hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[2])); + + Vec_u64x2 intermediate_result = hn::BitwiseIfThenElse(mask_low_28, low_28, mid_28); + *x = hn::GetLane(hn::BitwiseIfThenElse(mask_high_8, high_8, intermediate_result)); +} + +armral_status armral_seq_generator(uint32_t sequence_len, uint32_t seed, + uint8_t *p_dst) { + + uint64_t *p_out = (uint64_t *)p_dst; + + // Set the required masks. + Vec_u64x2 mask_28 = hn::Set(du64x2, 0x7ffffff80000000); + Vec_u64x2 mask_5 = hn::Set(du64x2, 0xf800000000000000); + + // Set the first 64 bits x2. + uint64_t cinit = seed & 0x7fffffff; + uint64_t x2 = cinit; + x2 |= hn::GetLane(hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), mask_28)); + x2 |= hn::GetLane(hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), mask_5)); + + // The sequence x1 is determined according to x1(n+31) = x1(n+3) ^ x1(n) + // The initial conditions for x1 are x1(0) = 1, x1(n) = 0, n = 1, 2, ..., 30. + // We don't need to calculate the first 1664 bits of x1 (like we do for x2) as + // they don't depend on the seed so are always the same. x1 contains bits 1600 + // to 1663 of the sequence. + uint64_t x1 = 0x6ac0a9a45e485840; + + // Generate x2 processing 128 bits at a time. 
After the loop, x2 will contain + // bits 1536 to 1599. + // x2(n+31) = x2(n) + x2(n+1) + x2(n+2) + x2(n+3) + for (uint32_t n = 0; n < 12; n++) { + generate_seq_128<2>(&x2); + } + // Generate bits 1600 to 1663 for x2 + generate_seq_64<2>(&x2); + + uint32_t length = sequence_len / 64; + // Generate c(n) = (x1(n+Nc) + x2(n+Nc)), Nc = 1600 + for (uint32_t n = 0; n < length; n++) { + *p_out = x1 ^ x2; + p_out++; + + // Next 64 bits of x1 and x2. + generate_seq_64<1>(&x1); + generate_seq_64<2>(&x2); + } + + // Tail + if ((sequence_len % 64) != 0) { + uint8_t tail_length = ((sequence_len % 64) + 7) >> 3; + uint64_t ptemp_res = x1 ^ x2; + + Vec_u64x2 splat_val = hn::Set(du64x2, ptemp_res); + Vec_u8x16 splat_val8 = BitCast(du8x16, splat_val); + hn::StoreN(splat_val8, du8x16, (uint8_t *)p_out, tail_length); + } + + return ARMRAL_SUCCESS; +} diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 91f4bf1..2658f75 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -33,7 +33,7 @@ const hn::Full128 di32x4; const hn::Full128 du64x2; const hn::Full128 di64x2; -// Vector Types +// Vector Types using Vec_u8x16 = hn::Vec; using Vec_i8x16 = hn::Vec; using Vec_u16x8 = hn::Vec; @@ -50,7 +50,6 @@ which the rebind tag is created from and the second is the new tag type. These are used in operations where output vector width is different from that of the input. */ - /* Scalable vector types. The default choice should be to use these vector types since it allows for processing of more @@ -72,7 +71,7 @@ const hn::ScalableTag di32; const hn::ScalableTag du64; const hn::ScalableTag di64; -// Vector Types +// Vector Types using Vec_u8 = hn::Vec; using Vec_i8 = hn::Vec; using Vec_u16 = hn::Vec; @@ -86,4 +85,23 @@ using Vec_i64 = hn::Vec; /* e.g. const hn::Rebind di16_di8; where the first tag named in the rebind tag is the old type which the rebind tag is created from and the second is the -new tag type. */ \ No newline at end of file +new tag type. */ + +/* It has been found that highway implementations of +MaskedLoad and MaskedLoadU are memory unsafe and will not +pass address sanitization. As such, we exclude this function +from sanitization. +*/ +#if __SANITIZE_ADDRESS__ +#define LOAD_ATTR __attribute__((no_sanitize("address"))) +#else +#define LOAD_ATTR HWY_API +#endif + +namespace no_sanitize { +template +LOAD_ATTR hn::VFromD +MaskedLoadU(D d, M m, const hn::TFromD *HWY_RESTRICT unaligned) { + return hn::MaskedLoadU(d, m, unaligned); +} +} // namespace no_sanitize \ No newline at end of file -- GitLab From a6997fb108e422192a993a46a437acfcdbde23a9 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Mon, 6 Jan 2025 11:44:45 +0000 Subject: [PATCH 06/20] Enable SVE2 when compiling the CRC files, as this produces acceptable performance when the `sve2-aes` flags is added. --- armral_hwy.cmake.in | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index 5a0088f..502cdac 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -34,9 +34,9 @@ if(ARMRAL_OPT_FLAGS) target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) endif() -# The PMULL instruction is required for CRC which requires the AES extension -# that is only available under NEON and SVE2 on aarch64. To avoid falling back -# to generic implementations we fix ourselves on NEON for all Arm platforms +# The PMULL instruction requires the AES extension which is only available under +# NEON and SVE2 on aarch64. 
We have disabled SVE for all Arm platforms when +# PMULL is required; to avoid falling back to (slower) generic implementations set_property( SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp @@ -45,8 +45,7 @@ set_property( ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp APPEND - PROPERTY COMPILE_DEFINITIONS - HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE_256|HWY_SVE) # GCC recognizes the usage of XOR as an associative operation, then it tries to # optimize the operation tree in its tree-reassoc pass, but it actually makes -- GitLab From e5b175d0c1ef73e77cbff2c39ef09e0f0325ddac Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Wed, 8 Jan 2025 09:40:18 +0000 Subject: [PATCH 07/20] Fix scrambling for non-128-bit vector sizes. Using scalable tags rather than fixed tags incurs an overhead, particularly on SVE. However, this is expected to see improved performance on platforms with larger vector sizes. --- .../Scrambling/highway/arm_scrambling.cpp | 59 +++++++++++-------- .../highway/arm_mat_seq_generator.cpp | 4 +- src/utils/hwy_types.hpp | 13 ++-- 3 files changed, 48 insertions(+), 28 deletions(-) diff --git a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp index 3c3e502..1f80200 100644 --- a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp @@ -1,16 +1,15 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 */ #include "armral.h" #include "utils/hwy_types.hpp" namespace hn = hwy::HWY_NAMESPACE; -// Highway vector type -using Mask_u8 = hn::Mask; - -HWY_API void xor_u8(const uint8_t *__restrict &src, +HWY_FORCED_INLINE void xor_u8(const uint8_t *__restrict &src, const uint8_t *__restrict &seq, uint8_t *&dst, size_t n_lanes) { hn::StoreU(hn::Xor(hn::LoadU(du8, src), hn::LoadU(du8, seq)), du8, dst); @@ -19,46 +18,60 @@ HWY_API void xor_u8(const uint8_t *__restrict &src, dst += n_lanes; } +HWY_FORCED_INLINE void xor_u8_partial(const uint8_t *__restrict &src, + const uint8_t *__restrict &seq, uint8_t *&dst, + size_t n_lanes) { + Mask_u8 final_mask = hn::FirstN(du8, n_lanes); + Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, final_mask, src); + Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, final_mask, seq); + hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, n_lanes); +} + armral_status armral_scramble_code_block(const uint8_t *__restrict src, const uint8_t *__restrict seq, uint32_t num_bits, uint8_t *dst) { size_t bytes = (num_bits + 7) >> 3; size_t n_lanes = hn::Lanes(du8); - // No vectorization for less than 8 bytes. - if (bytes < 8) { - Mask_u8 early_mask = hn::FirstN(du8, bytes); - Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, early_mask, src); - Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, early_mask, seq); - hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, bytes); + // Short-circuit if less than a full vector + if (bytes < n_lanes) { + xor_u8_partial(src, seq, dst, bytes); return ARMRAL_SUCCESS; } size_t rem_bytes = bytes; + size_t n_qlanes = n_lanes << 2; + size_t n_dlanes = n_lanes << 1; - // Use unrolled vectorized loop if there are at least 32 bytes. 
- if (rem_bytes > 31) { - uint32_t full_256_vecs = rem_bytes >> 5; - rem_bytes = rem_bytes % 32; - while (full_256_vecs > 0) { + // Use unrolled vectorized loop if there are at least 4 full vectors of data. + if (rem_bytes > n_qlanes - 1) { + uint32_t full_quad_vecs = rem_bytes / n_qlanes; + rem_bytes = rem_bytes % n_qlanes; + while (full_quad_vecs > 0) { + xor_u8(src, seq, dst, n_lanes); xor_u8(src, seq, dst, n_lanes); xor_u8(src, seq, dst, n_lanes); - full_256_vecs--; + xor_u8(src, seq, dst, n_lanes); + full_quad_vecs--; } } - // Process 16 of the remaining bytes. - if (rem_bytes > 15) { - rem_bytes = rem_bytes % 16; + // Use unrolled vectorized loop if there are at least 2 full vectors of data. + if (rem_bytes > n_dlanes - 1) { + rem_bytes = rem_bytes % n_dlanes; + xor_u8(src, seq, dst, n_lanes); + xor_u8(src, seq, dst, n_lanes); + } + + // Process any final full vectors. + if (rem_bytes > n_lanes - 1) { + rem_bytes = rem_bytes % n_lanes; xor_u8(src, seq, dst, n_lanes); } // Process final partial vector. if (rem_bytes != 0) { - Mask_u8 final_mask = hn::FirstN(du8, rem_bytes); - Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, final_mask, src); - Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, final_mask, seq); - hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, rem_bytes); + xor_u8_partial(src, seq, dst, rem_bytes); } return ARMRAL_SUCCESS; diff --git a/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp index 9f39abc..044b682 100644 --- a/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 */ #include "armral.h" @@ -54,7 +56,7 @@ static inline void generate_seq_64(uint64_t *x) { armral_status armral_seq_generator(uint32_t sequence_len, uint32_t seed, uint8_t *p_dst) { - + uint64_t *p_out = (uint64_t *)p_dst; // Set the required masks. diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 2658f75..2ef5cab 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -6,7 +6,7 @@ */ /* -Defines base vector types for developing using Google Highway for +Defines base vector types for developing using Google Highway for vector instructions. If there is a common vector type which is used across files include it here. Otherwise, for specific vector types, such as a hn::Rebind should be included locally in source files. @@ -17,6 +17,8 @@ such as a hn::Rebind should be included locally in source files. #include namespace hn = hwy::HWY_NAMESPACE; +#define HWY_FORCED_INLINE static HWY_INLINE HWY_FLATTEN + /* Full128 Vector Types. In general, use these where there is cross-lane logic is used (e.g. a pairwise add) or where data-chunking is heavily @@ -46,7 +48,7 @@ using Vec_i64x2 = hn::Vec; // Rebind Tags /* e.g. const hn::Rebind di16x8_di8x16; where the first tag named in the rebind tag is the old type -which the rebind tag is created from and the second is the +which the rebind tag is created from and the second is the new tag type. These are used in operations where output vector width is different from that of the input. */ @@ -57,7 +59,7 @@ data for wider vector widths. Use Full128 for the reasons listed above. Note lack of quantity of vector elements - this is variable. 
-Use hn::Lanes(vector_tag) to stride by the correct size when +Use hn::Lanes(vector_tag) to stride by the correct size when looping over data. */ @@ -81,10 +83,13 @@ using Vec_i32 = hn::Vec; using Vec_u64 = hn::Vec; using Vec_i64 = hn::Vec; +// Mask Types +using Mask_u8 = hn::Mask; + // Rebind Tags /* e.g. const hn::Rebind di16_di8; where the first tag named in the rebind tag is the old type -which the rebind tag is created from and the second is the +which the rebind tag is created from and the second is the new tag type. */ /* It has been found that highway implementations of -- GitLab From 6dbe538f7215155add57df861cab1f22c713c060 Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Thu, 9 Jan 2025 15:08:27 +0000 Subject: [PATCH 08/20] Port modulation and demodulation to Highways. Performance is similar to the ACLE implementation with notable exceptions: - QAM64 demodulation uses a scalar tail causing lower performance on small sizes that are not a multiple of 4. - QAM16 modulation uses memcpy for vector sizes below 256 bits since memcpy outperforms the vector implementation. - Modulation on NEON performs worse due to the higher overhead of masking used compared to the largely unvectorised previous implementation. --- CMakeLists.txt | 8 +- armral_hwy.cmake.in | 35 +- .../Demodulation/highway/arm_demodulation.cpp | 498 ++++++++++++ .../Modulation/highway/arm_modulation.cpp | 766 ++++++++++++++++++ src/utils/acle/bits_to_bytes.hpp | 128 +++ src/utils/bits_to_bytes.hpp | 130 +-- src/utils/highway/bits_to_bytes.hpp | 131 +++ src/utils/hwy_types.hpp | 30 +- 8 files changed, 1580 insertions(+), 146 deletions(-) create mode 100644 src/UpperPHY/Demodulation/highway/arm_demodulation.cpp create mode 100644 src/UpperPHY/Modulation/highway/arm_modulation.cpp create mode 100644 src/utils/acle/bits_to_bytes.hpp create mode 100644 src/utils/highway/bits_to_bytes.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a20eb8..bc25495 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,12 +430,12 @@ if(BUILD_TESTING) # test/UpperPHY/ConvolutionalDecoder/main.cpp) # add_armral_test(tail_biting_convolutional_encoding # test/UpperPHY/ConvolutionalEncoder/main.cpp) - # add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) # add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) # add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) # add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) # add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - # add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) # add_armral_test(polar_crc_attachment # test/UpperPHY/Polar/CrcAttachment/main.cpp) # add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) @@ -640,14 +640,14 @@ if(BUILD_TESTING) # bench/UpperPHY/ConvolutionalDecoder/main.cpp) # add_armral_bench(tail_biting_convolutional_encoding # bench/UpperPHY/ConvolutionalEncoder/main.cpp) - # add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) # add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) # add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) # add_armral_bench(ldpc_rate_matching # bench/UpperPHY/LDPC/RateMatching/main.cpp) # add_armral_bench(ldpc_rate_recovery # bench/UpperPHY/LDPC/RateRecovery/main.cpp) - # 
add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) # add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) # add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) # add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index e59af20..0d4bc8c 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -34,20 +34,10 @@ if(ARMRAL_OPT_FLAGS) target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) endif() -# The PMULL instruction is required for CRC and other functions which requires -# the AES extension that is only available under NEON and SVE2 on aarch64. To -# avoid falling back to generic implementations we fix ourselves on NEON for all -# Arm platforms -set_property( - SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp - APPEND - PROPERTY COMPILE_DEFINITIONS - HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) - -# The PMULL instruction is required for CRC which requires the AES extension -# that is only available under NEON and SVE2 on aarch64. To avoid falling back -# to generic implementations we fix ourselves on NEON for all Arm platforms +# The PMULL instruction is required for CRC and others which requires the AES +# extension that is only available under NEON and SVE2 on aarch64. To avoid +# falling back to generic implementations we fix ourselves on NEON for all Arm +# platforms set_property( SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp @@ -61,6 +51,19 @@ set_property( PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) +# The SQRDMULH instruction required by demodulation for fixed point +# multiplication is only available under NEON and SVE2 on aarch64. We have +# disabled SVE for all Arm platforms when SQRDMULH is required; to avoid falling +# back to (slower) generic implementations. Additionally disable SVE2 for all +# Arm platforms for demodulation as the SVE implementation of the +# OrderedDemote2To operation adds a ~40% overhead to demodulation. +set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + # GCC recognizes the usage of XOR as an associative operation, then it tries to # optimize the operation tree in its tree-reassoc pass, but it actually makes # the performance much worse. 
Disabling the tree-assoc pass means that the @@ -141,12 +144,12 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp diff --git a/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp new file mode 100644 index 0000000..72c0779 --- /dev/null +++ b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp @@ -0,0 +1,498 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#include "armral.h" + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +HWY_FORCED_INLINE int8_t sat_mul_double(int16_t a, int16_t b) { + int32_t value = (2 * (int32_t)a) * (int32_t)b; + value = (value + 0x8000) >> 16; + if (value > INT8_MAX) { + return INT8_MAX; + } + if (value < INT8_MIN) { + return INT8_MIN; + } + return (int8_t)value; +} + +HWY_FORCED_INLINE Vec_i8 generate_partial_llrs(const Vec_i16 rec_a, + const Vec_i16 rec_b, + const Vec_i16 weight_v) { + /* Computing L(c[n]/r) and L(c[n+1]/r) */ + Vec_i16 llr16_1a = hn::MulFixedPoint15(rec_a, weight_v); + Vec_i16 llr16_1b = hn::MulFixedPoint15(rec_b, weight_v); + + return hn::OrderedDemote2To(di8, llr16_1a, llr16_1b); +} + +HWY_FORCED_INLINE Vec_i8_half +generate_partial_llrs_half_vect(const Vec_i16 rec_a, const Vec_i16 weight_v) { + /* Computing L(c[n]/r) */ + Vec_i16 llr16_1a = hn::MulFixedPoint15(rec_a, weight_v); + + return hn::DemoteTo(di8_half, llr16_1a); +} + +HWY_FORCED_INLINE armral_status +armral_demodulation_qpsk(const uint32_t n_symbols, const uint16_t ulp, + const armral_cmplx_int16_t *p_src, int8_t *p_dst) { + // The log likelihood ratio of a bit being received as 1 is directly + // proportional to the modulated symbol received + const int16_t weight = (1 << 15) / ulp; + const Vec_i16 weight_v = hn::Set(di16, weight); + + /* Compute 8 complex symbols at a time */ + uint32_t blk_cnt = n_symbols / hn::Lanes(di16); + while (blk_cnt > 0U) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + Vec_i16 rec_b = hn::LoadU(di16, (const int16_t *)p_src + hn::Lanes(di16)); + + Vec_i8 llr8_b = generate_partial_llrs(rec_a, rec_b, weight_v); + + hn::StoreU(llr8_b, di8, p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16); + p_dst += hn::Lanes(di8); + blk_cnt--; + } + + uint32_t tail_cnt = n_symbols & (hn::Lanes(di16) - 1); + + /* Compute 4 
complex symbols at a time */ + if (tail_cnt >= hn::Lanes(di16_half)) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + + Vec_i8_half llr8_a = generate_partial_llrs_half_vect(rec_a, weight_v); + + hn::StoreU(llr8_a, di8_half, p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16_half); + p_dst += hn::Lanes(di8_half); + tail_cnt -= hn::Lanes(di16_half); + } + + /* Compute remaining symbols */ + if (tail_cnt > 0U) { + size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); + Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); + Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + + Vec_i8_half llr8_a = generate_partial_llrs_half_vect(rec_a, weight_v); + + hn::StoreN(llr8_a, di8_half, p_dst, iteration_symbols); + + // // Twice this number of int16 but count is for armral_cmplx_int16_t + // p_src += iteration_symbols / 2; + // p_dst += iteration_symbols; + // tail_cnt -= iteration_symbols / 2; + } + return ARMRAL_SUCCESS; +} + +/* + * Inline function for 16QAM demodulation + * @par LLRs calculation for 16QAM + LLRs calculation for 16QAM, having received a complex symbol rx = + rx_re + j*rx_im. The LLRs calculations are made approximately with thresholds + method, to have a complexity of O(m). 16QAM Gray mapping following 3GPP + TS 38.211 V15.2.0, Ch. 5.1 Modulation mapper Bits position[c0 c1 c2 c3] + + LLR(c0|r) = weight * [-rx_re] + LLR(c1|r) = weight * [-rx_im] + LLR(c2|r) = weight * [|rx_re| - 2/sqrt(10)] + LLR(c3|r) = weight * [|rx_im| - 2/sqrt(10)] + * +*/ + +HWY_FORCED_INLINE armral_status +armral_demodulation_16qam(const uint32_t n_symbols, const uint16_t ulp, + const armral_cmplx_int16_t *p_src, int8_t *p_dst) { + // THR_16QAM = 2/sqrt(10) [Q(2.13)] due to 3GPP Gray Mapping for 16QAM + // modulation +#define THR_16QAM (5181) + + // The log likelihood ratio of a bit being received as 1 is directly + // proportional to the modulated symbol received + const int16_t weight = (1 << 15) / ulp; + const Vec_i16 weight_v = hn::Set(di16, weight); + + const Vec_i16 thrs = hn::Set(di16, THR_16QAM); + + /* Compute 8 complex symbols at a time */ + uint32_t blk_cnt = n_symbols / hn::Lanes(di16); + while (blk_cnt > 0U) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + Vec_i16 rec_b = hn::LoadU(di16, (const int16_t *)p_src + hn::Lanes(di16)); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8 llr8_1b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thrs, hn::Abs(rec_a)); + rec_b = hn::Sub(thrs, hn::Abs(rec_b)); + + Vec_i8 llr8_2b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Store results for consecutive sets of symbols */ + hn::StoreInterleaved2(hn::BitCast(du16, llr8_1b), hn::BitCast(du16, llr8_2b), du16, (uint16_t *)p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16); + p_dst += 2 * hn::Lanes(di8); + blk_cnt--; + } + + uint32_t tail_cnt = n_symbols & (hn::Lanes(di16) - 1); + + /* Compute 4 complex symbols at a time */ + if (tail_cnt >= hn::Lanes(di16_half)) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thrs, hn::Abs(rec_a)); + Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Store results for consecutive sets of symbols */ + 
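+    // llr8_1b packs the (c0, c1) LLR pair and llr8_2b the (c2, c3) pair for
+    // each symbol into 16-bit lanes, so interleaving at u16 granularity
+    // writes c0, c1, c2, c3 contiguously for each symbol.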
hn::StoreInterleaved2(hn::BitCast(du16_half, llr8_1a), hn::BitCast(du16_half, llr8_2a), du16_half, (uint16_t *)p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16_half); + p_dst += 2 * hn::Lanes(di8_half); + tail_cnt -= hn::Lanes(di16_half); + } + + if (tail_cnt > 0U) { + size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); + Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); + Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thrs, hn::Abs(rec_a)); + Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); + + Vec_i8 llr8_1a_full = hn::ZeroExtendVector(di8, llr8_1a); + Vec_i8 llr8_2a_full = hn::ZeroExtendVector(di8, llr8_2a); + Vec_u16 llr8_interleaved = hn::InterleaveWholeLower( + hn::BitCast(du16, llr8_1a_full), hn::BitCast(du16, llr8_2a_full)); + hn::StoreN(llr8_interleaved, du16, (uint16_t *)p_dst, iteration_symbols); + + // // Twice this number of int16 but count is for armral_cmplx_int16_t + // p_src += iteration_symbols / 2; + // p_dst += 2 * iteration_symbols; + // tail_cnt -= iteration_symbols / 2; + } + return ARMRAL_SUCCESS; +} + +/* + * Inline function for 64QAM demodulation + * @par LLRs calculation for 64QAM + LLRs calculation for 64QAM, having received a complex symbol rx = + rx_re + j*rx_im. The LLRs calculations are made approximately with thresholds + method, to have a complexity of O(m). 64QAM Gray mapping following 3GPP + TS 38.211 V15.2.0, Ch. 5.1 Modulation mapper Bits position[c0 c1 c2 c3 c4 c5] + + LLR(c0|r) = weight * [-rx_re] + LLR(c1|r) = weight * [-rx_im] + LLR(c2|r) = weight * [|rx_re| - 4/sqrt(42)] + LLR(c3|r) = weight * [|rx_im| - 4/sqrt(42)] + LLR(c4|r) = weight * [||rx_re| - 4/sqrt(42)| - 2/sqrt(42)] + LLR(c5|r) = weight * [||rx_im| - 4/sqrt(42)| - 2/sqrt(42)] + * +*/ + +HWY_FORCED_INLINE armral_status +armral_demodulation_64qam(const uint32_t n_symbols, const uint16_t ulp, + const armral_cmplx_int16_t *p_src, int8_t *p_dst) { + // THR_64QAM_1 = 4/sqrt(42) [Q(2.13) format], due to 3GPP Gray Mapping for + // 64QAM modulation +#define THR_64QAM_1 (5056) + // = 2/sqrt(42) [Q(2.13) format], due to 3GPP Gray Mapping for 64QAM + // modulation +#define THR_64QAM_2 (2528) + + // The log likelihood ratio of a bit being received as 1 is directly + // proportional to the modulated symbol received + const int16_t weight = (1 << 15) / ulp; + const Vec_i16 weight_v = hn::Set(di16, weight); + + const Vec_i16 thr_1 = hn::Set(di16, THR_64QAM_1); + const Vec_i16 thr_2 = hn::Set(di16, THR_64QAM_2); + + /* Compute 8 complex symbols at a time */ + uint32_t blk_cnt = n_symbols / hn::Lanes(di16); + while (blk_cnt > 0U) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + Vec_i16 rec_b = hn::LoadU(di16, (const int16_t *)p_src + hn::Lanes(di16)); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8 llr8_1b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thr_1, hn::Abs(rec_a)); + rec_b = hn::Sub(thr_1, hn::Abs(rec_b)); + + Vec_i8 llr8_2b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c4/r) and L(c5/r) */ + rec_a = hn::Sub(thr_2, hn::Abs(rec_a)); + rec_b = hn::Sub(thr_2, hn::Abs(rec_b)); + + Vec_i8 llr8_3b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Store results for consecutive sets of symbols */ 
+ hn::StoreInterleaved3(hn::BitCast(du16, llr8_1b), + hn::BitCast(du16, llr8_2b), + hn::BitCast(du16, llr8_3b), du16, (uint16_t *)p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16); + p_dst += 3 * hn::Lanes(di8); + blk_cnt--; + } + + uint32_t tail_cnt = n_symbols & (hn::Lanes(di16) - 1); + + /* Compute 4 complex symbols at a time */ + if (tail_cnt >= hn::Lanes(di16_half)) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thr_1, hn::Abs(rec_a)); + Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c4/r) and L(c5/r) */ + rec_a = hn::Sub(thr_2, hn::Abs(rec_a)); + Vec_i8_half llr8_3a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Store results for consecutive sets of symbols */ + hn::StoreInterleaved3( + hn::BitCast(du16_half, llr8_1a), hn::BitCast(du16_half, llr8_2a), + hn::BitCast(du16_half, llr8_3a), du16_half, (uint16_t *)p_dst); + + // Twice this number of int16 but count is for armral_cmplx_int16_t + p_src += hn::Lanes(di16_half); + p_dst += 3 * hn::Lanes(di8_half); + tail_cnt -= hn::Lanes(di16_half); + } + + while (tail_cnt > 0U) { + // L(c0/r) and L(c1/r) + *p_dst++ = sat_mul_double(p_src->re, weight); + *p_dst++ = sat_mul_double(p_src->im, weight); + + // L(c2/r) and L(c3/r) + int16_t tmp_1a = THR_64QAM_1 - abs(p_src->re); + int16_t tmp_1b = THR_64QAM_1 - abs(p_src->im); + *p_dst++ = sat_mul_double(tmp_1a, weight); + *p_dst++ = sat_mul_double(tmp_1b, weight); + + // L(c4/r) and L(c5/r) + *p_dst++ = sat_mul_double(THR_64QAM_2 - abs(tmp_1a), weight); + *p_dst++ = sat_mul_double(THR_64QAM_2 - abs(tmp_1b), weight); + + p_src++; + tail_cnt--; + } + return ARMRAL_SUCCESS; +} + +/* + * Inline function for 256QAM demodulation + * @par LLRs calculation for 256QAM + LLRs calculation for 256QAM, having received a complex symbol rx = + rx_re + j*rx_im. The LLRs calculations are made approximately with thresholds + method, to have a complexity of O(m). 256QAM Gray mapping following 3GPP + TS 38.211 V15.2.0, Ch. 
5.1 Modulation mapper Bits position[c0 c1 c2 c3 c4 + c5 c6 c7] + + LLR(c0|r) = weight * [-rx_re] + LLR(c1|r) = weight * [-rx_im] + LLR(c2|r) = weight * [|rx_re| - 8/sqrt(170)] + LLR(c3|r) = weight * [|rx_im| - 8/sqrt(170)] + LLR(c4|r) = weight * [||rx_re| - 8/sqrt(170)| - 4/sqrt(170)] + LLR(c5|r) = weight * [||rx_im| - 8/sqrt(170)| - 4/sqrt(170)] + LLR(c6|r) = weight * [|||rx_re| - 8/sqrt(170)| - 4/sqrt(170)| - 2/sqrt(170)] + LLR(c7|r) = weight * [|||rx_im| - 8/sqrt(170)| - 4/sqrt(170)| - 2/sqrt(170)] + * +*/ +HWY_FORCED_INLINE armral_status +armral_demodulation_256qam(const uint32_t n_symbols, const uint16_t ulp, + const armral_cmplx_int16_t *p_src, int8_t *p_dst) { + // THR_256QAM_1 = 8/sqrt(170) [Q(2.13) format], due to 3GPP Gray Mapping for + // 256QAM modulation +#define THR_256QAM_1 (5026) + // THR_256QAM_2 = 4/sqrt(170) [Q(2.13) format], due to 3GPP Gray Mapping for + // 256QAM modulation +#define THR_256QAM_2 (2513) + // THR_256QAM_3 = 2/sqrt(170) [Q(2.13) format], due to 3GPP Gray Mapping for + // 256QAM modulation +#define THR_256QAM_3 (1257) + + // The log likelihood ratio of a bit being received as 1 is directly + // proportional to the modulated symbol received + const int16_t weight = (1 << 15) / ulp; + const Vec_i16 weight_v = hn::Set(di16, weight); + + const Vec_i16 thr_1 = hn::Set(di16, THR_256QAM_1); + const Vec_i16 thr_2 = hn::Set(di16, THR_256QAM_2); + const Vec_i16 thr_3 = hn::Set(di16, THR_256QAM_3); + + /* Compute 8 complex symbols at a time */ + uint32_t blk_cnt = n_symbols / hn::Lanes(di16); + while (blk_cnt > 0U) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + Vec_i16 rec_b = hn::LoadU(di16, (const int16_t *)p_src + hn::Lanes(di16)); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8 llr8_1b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thr_1, hn::Abs(rec_a)); + rec_b = hn::Sub(thr_1, hn::Abs(rec_b)); + + Vec_i8 llr8_2b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c4/r) and L(c5/r) */ + rec_a = hn::Sub(thr_2, hn::Abs(rec_a)); + rec_b = hn::Sub(thr_2, hn::Abs(rec_b)); + + Vec_i8 llr8_3b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Computing L(c6/r) and L(c7/r) */ + rec_a = hn::Sub(thr_3, hn::Abs(rec_a)); + rec_b = hn::Sub(thr_3, hn::Abs(rec_b)); + + Vec_i8 llr8_4b = generate_partial_llrs(rec_a, rec_b, weight_v); + + /* Store results for consecutive sets of symbols */ + hn::StoreInterleaved4(hn::BitCast(du16, llr8_1b), + hn::BitCast(du16, llr8_2b), + hn::BitCast(du16, llr8_3b), + hn::BitCast(du16, llr8_4b), du16, (uint16_t *)p_dst); + + p_src += hn::Lanes(di16); + p_dst += 4 * hn::Lanes(di8); + blk_cnt--; + } + + uint32_t tail_cnt = n_symbols & (hn::Lanes(di16) - 1); + + /* Compute 4 complex symbols at a time */ + if (tail_cnt >= hn::Lanes(di16_half)) { + Vec_i16 rec_a = hn::LoadU(di16, (const int16_t *)p_src); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thr_1, hn::Abs(rec_a)); + Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c4/r) and L(c5/r) */ + rec_a = hn::Sub(thr_2, hn::Abs(rec_a)); + Vec_i8_half llr8_3a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c6/r) and L(c7/r) */ + rec_a = hn::Sub(thr_3, hn::Abs(rec_a)); + Vec_i8_half llr8_4a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Store results for consecutive sets of symbols */ + 
hn::StoreInterleaved4( + hn::BitCast(du16_half, llr8_1a), hn::BitCast(du16_half, llr8_2a), + hn::BitCast(du16_half, llr8_3a), hn::BitCast(du16_half, llr8_4a), + du16_half, (uint16_t *)p_dst); + + p_src += hn::Lanes(di16_half); + p_dst += 4 * hn::Lanes(di8_half); + tail_cnt -= hn::Lanes(di16_half); + } + + if (tail_cnt > 0U) { + size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); + Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); + Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + + /* Computing L(c0/r) and L(c1/r) */ + Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c2/r) and L(c3/r) */ + rec_a = hn::Sub(thr_1, hn::Abs(rec_a)); + Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c4/r) and L(c5/r) */ + rec_a = hn::Sub(thr_2, hn::Abs(rec_a)); + Vec_i8_half llr8_3a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Computing L(c6/r) and L(c7/r) */ + rec_a = hn::Sub(thr_3, hn::Abs(rec_a)); + Vec_i8_half llr8_4a = generate_partial_llrs_half_vect(rec_a, weight_v); + + /* Store results for consecutive sets of symbols */ + Vec_i8 llr8_1a_full = hn::ZeroExtendVector(di8, llr8_1a); + Vec_i8 llr8_2a_full = hn::ZeroExtendVector(di8, llr8_2a); + Vec_i8 llr8_3a_full = hn::ZeroExtendVector(di8, llr8_3a); + Vec_i8 llr8_4a_full = hn::ZeroExtendVector(di8, llr8_4a); + + Vec_u16 llr8_even = hn::InterleaveWholeLower( + hn::BitCast(du16, llr8_1a_full), hn::BitCast(du16, llr8_3a_full)); + Vec_u16 llr8_odd = hn::InterleaveWholeLower( + hn::BitCast(du16, llr8_2a_full), hn::BitCast(du16, llr8_4a_full)); + + Vec_u16 llr8_lower = hn::InterleaveWholeLower(llr8_even, llr8_odd); + Vec_u16 llr8_upper = hn::InterleaveWholeUpper(du16, llr8_even, llr8_odd); + size_t save_lanes = iteration_symbols * 2; + hn::StoreN(llr8_lower, du16, (uint16_t *)p_dst, save_lanes); + if (save_lanes > hn::Lanes(du16)) { + hn::StoreN(llr8_upper, du16, (uint16_t *)p_dst + hn::Lanes(du16), + save_lanes - hn::Lanes(du16)); + } + + // // Twice this number of int16 but count is for armral_cmplx_int16_t + // p_src += iteration_symbols / 2; + // p_dst += 4 * iteration_symbols; + // tail_cnt -= iteration_symbols / 2; + } + return ARMRAL_SUCCESS; +} + +armral_status armral_demodulation(const uint32_t n_symbols, const uint16_t ulp, + armral_modulation_type mod_type, + const armral_cmplx_int16_t *p_src, + int8_t *p_dst) { + // If we don't set the return type, it's because the modType isn't recognized. + // Therefore, we have an argument error by default. 
+ armral_status ret = ARMRAL_ARGUMENT_ERROR; + switch (mod_type) { + case ARMRAL_MOD_QPSK: + ret = armral_demodulation_qpsk(n_symbols, ulp, p_src, p_dst); + break; + case ARMRAL_MOD_16QAM: + ret = armral_demodulation_16qam(n_symbols, ulp, p_src, p_dst); + break; + case ARMRAL_MOD_64QAM: + ret = armral_demodulation_64qam(n_symbols, ulp, p_src, p_dst); + break; + case ARMRAL_MOD_256QAM: + ret = armral_demodulation_256qam(n_symbols, ulp, p_src, p_dst); + break; + } + return ret; +} diff --git a/src/UpperPHY/Modulation/highway/arm_modulation.cpp b/src/UpperPHY/Modulation/highway/arm_modulation.cpp new file mode 100644 index 0000000..c225621 --- /dev/null +++ b/src/UpperPHY/Modulation/highway/arm_modulation.cpp @@ -0,0 +1,766 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#include "armral.h" +#include + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +/* Definition of the constellation map according to 3GPP specification. + * Gray encoding is used and + * 0x16A1 = sqrt(2)/2 in Q2.13 + */ +static const armral_cmplx_int16_t constellation_qpsk[4] = { + {0x16A1, 0x16A1}, {0x16A1, -0x16A1}, {-0x16A1, 0x16A1}, {-0x16A1, -0x16A1}}; + +void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, + armral_cmplx_int16_t *p_dst) { + /* Compute the number of blocks of 2 bits in the new tail */ + uint32_t final_blck = (nbits >> 1U) & 3; + + const size_t n_lanes8 = hn::Lanes(di8); + const size_t n_lanes16 = hn::Lanes(di16); + const size_t n_lanes32 = hn::Lanes(di32); + const size_t n_lanes64 = hn::Lanes(di64); + + const Vec_i16 svqpsk_pos = hn::Set(di16, 0x16A1); + const Vec_i16 svqpsk_neg = hn::Set(di16, -0x16A1); + // shuffle to map flip of bit order within bytes of input (there is no + // instruction for doing this on predicates directly, so we do it on the + // result instead). + Vec_u64 shuf0_64 = hn::MulAdd(hn::Iota(du64, 0), hn::Set(du64, 0x0808080808080808ULL), hn::Set(du64, 0x0001020304050607ULL)); + Vec_i16 shuf0 = hn::BitCast(di16, hn::InterleaveWholeLower(hn::BitCast(du8, shuf0_64), hn::Set(du8, 0))); + while (nbits > n_lanes8) { + // load predicate as one bit per 8-bit element, then unpack into one bit + // per 16-bit element and use to select result. 
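+    // Each source byte supplies 8 mask bits, one per u8 lane; after widening
+    // to i16 lanes, a set bit selects -0x16A1 and a clear bit +0x16A1 for one
+    // 16-bit I or Q component. The byte-internal bit order is restored by the
+    // shuf0 permutation applied to the result.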
+#if HWY_TARGET & (HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + // SVE can load mask bits very fast by casting uint8 arrays to svbool + Mask_u8 in = *(const svbool_t *)p_src; +#else + Mask_u8 in = hn::LoadMaskBits(du8, p_src); +#endif + Mask_i16 in_lo = hn::PromoteMaskTo(di16, di16_du8, hn::LowerHalfOfMask(di16_du8, in)); +#if HWY_TARGET & (HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + Mask_i16 in_hi = svunpkhi_b(in); +#else + Mask_i16 in_hi = hn::PromoteMaskTo(di16, di16_du8, hn::UpperHalfOfMask(di16_du8, in)); +#endif + p_src += n_lanes64; + Vec_i16 vals_lo = hn::IfThenElse(in_lo, svqpsk_neg, svqpsk_pos); + Vec_i16 vals_hi = hn::IfThenElse(in_hi, svqpsk_neg, svqpsk_pos); + Vec_i16 out_lo = hn::TableLookupLanes(vals_lo, hn::IndicesFromVec(di16, shuf0)); + Vec_i16 out_hi = hn::TableLookupLanes(vals_hi, hn::IndicesFromVec(di16, shuf0)); + hn::StoreU(out_lo, di16, (int16_t *)p_dst); + hn::StoreU(out_hi, di16, ((int16_t *)p_dst) + n_lanes16); + p_dst += n_lanes16; + nbits -= n_lanes8; + } + + const uint32_t bytes = nbits >> 3U; + const uint32_t vl = n_lanes32 / 4; + const uint32_t unrolls = bytes / vl; + const hn::Mask pred = hn::FirstN(du16_du8, (int32_t)vl); + const Vec_u16 linear_series = hn::Iota(du16, 0); + const Vec_u16 mask = hn::Dup128VecFromValues(du16, 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + Vec_u16 index = hn::InterleaveWholeLower(du16, linear_series, linear_series); + index = hn::InterleaveWholeLower(du16, index, index); + index = hn::InterleaveWholeLower(du16, index, index); + for (uint32_t i = 0; i < unrolls; i++) { + Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoadU(du16_du8, pred, p_src)); + p_src += vl; + Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); + Mask_i16 mask_pred = hn::RebindMask(di16, hn::TestBit(tbl, mask)); + Vec_i16 points = hn::IfThenElse(mask_pred, svqpsk_neg, svqpsk_pos); + hn::StoreU(points, di16, (int16_t *)p_dst); + p_dst += n_lanes32; + } + + const int32_t leftover_bytes = bytes % vl; + const hn::Mask load_lanes = hn::FirstN(du16_du8, leftover_bytes); + const uint32_t active_store_lanes = leftover_bytes * 8; + if (leftover_bytes != 0) { + Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoadU(du16_du8, load_lanes, p_src)); + p_src += leftover_bytes; + Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); + Mask_i16 mask_pred = hn::RebindMask(di16, hn::TestBit(tbl, mask)); + Vec_i16 points = hn::IfThenElse(mask_pred, svqpsk_neg, svqpsk_pos); + hn::StoreN(points, di16, (int16_t *)p_dst, leftover_bytes * 8); + p_dst += active_store_lanes / 2; + } + + /* There might be a tail having less than 8 bits */ + if (final_blck != 0U) { + uint8_t sample = *p_src; + uint8_t index0 = sample >> 6; + *p_dst++ = constellation_qpsk[index0]; + final_blck--; + + /* Another possible sample on 2 bits might be present */ + if (final_blck != 0U) { + uint8_t index1 = (sample >> 4) & 0x3; + *p_dst++ = constellation_qpsk[index1]; + final_blck--; + + /* The very last sample on 2 bits, if needed */ + if (final_blck != 0U) { + uint8_t index2 = (sample >> 2) & 0x3; + *p_dst++ = constellation_qpsk[index2]; + } + } + } +} + +/* Definition of the constellation map according to 3GPP specification. 
+ * Gray encoding is used and + * 0xA1F = 1 * sqrt(10)/10 in Q2.13 + * 0x1E5C = 3 * sqrt(10)/10 in Q2.13 + */ +static const armral_cmplx_int16_t constellation_16qam[16] = { + {0xA1F, 0xA1F}, {0xA1F, 0x1E5C}, {0x1E5C, 0xA1F}, {0x1E5C, 0x1E5C}, + {0xA1F, -0xA1F}, {0xA1F, -0x1E5C}, {0x1E5C, -0xA1F}, {0x1E5C, -0x1E5C}, + {-0xA1F, 0xA1F}, {-0xA1F, 0x1E5C}, {-0x1E5C, 0xA1F}, {-0x1E5C, 0x1E5C}, + {-0xA1F, -0xA1F}, {-0xA1F, -0x1E5C}, {-0x1E5C, -0xA1F}, {-0x1E5C, -0x1E5C}}; + +/* An outer-product version to compute 2 symbols at a time using a single 8-bit + * lookup. */ +static const armral_cmplx_int16_t constellation_16qam_outer_prod[256][2] = { + {{0XA1F, 0XA1F}, {0XA1F, 0XA1F}}, + {{0XA1F, 0XA1F}, {0XA1F, 0X1E5C}}, + {{0XA1F, 0XA1F}, {0X1E5C, 0XA1F}}, + {{0XA1F, 0XA1F}, {0X1E5C, 0X1E5C}}, + {{0XA1F, 0XA1F}, {0XA1F, -0XA1F}}, + {{0XA1F, 0XA1F}, {0XA1F, -0X1E5C}}, + {{0XA1F, 0XA1F}, {0X1E5C, -0XA1F}}, + {{0XA1F, 0XA1F}, {0X1E5C, -0X1E5C}}, + {{0XA1F, 0XA1F}, {-0XA1F, 0XA1F}}, + {{0XA1F, 0XA1F}, {-0XA1F, 0X1E5C}}, + {{0XA1F, 0XA1F}, {-0X1E5C, 0XA1F}}, + {{0XA1F, 0XA1F}, {-0X1E5C, 0X1E5C}}, + {{0XA1F, 0XA1F}, {-0XA1F, -0XA1F}}, + {{0XA1F, 0XA1F}, {-0XA1F, -0X1E5C}}, + {{0XA1F, 0XA1F}, {-0X1E5C, -0XA1F}}, + {{0XA1F, 0XA1F}, {-0X1E5C, -0X1E5C}}, + {{0XA1F, 0X1E5C}, {0XA1F, 0XA1F}}, + {{0XA1F, 0X1E5C}, {0XA1F, 0X1E5C}}, + {{0XA1F, 0X1E5C}, {0X1E5C, 0XA1F}}, + {{0XA1F, 0X1E5C}, {0X1E5C, 0X1E5C}}, + {{0XA1F, 0X1E5C}, {0XA1F, -0XA1F}}, + {{0XA1F, 0X1E5C}, {0XA1F, -0X1E5C}}, + {{0XA1F, 0X1E5C}, {0X1E5C, -0XA1F}}, + {{0XA1F, 0X1E5C}, {0X1E5C, -0X1E5C}}, + {{0XA1F, 0X1E5C}, {-0XA1F, 0XA1F}}, + {{0XA1F, 0X1E5C}, {-0XA1F, 0X1E5C}}, + {{0XA1F, 0X1E5C}, {-0X1E5C, 0XA1F}}, + {{0XA1F, 0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{0XA1F, 0X1E5C}, {-0XA1F, -0XA1F}}, + {{0XA1F, 0X1E5C}, {-0XA1F, -0X1E5C}}, + {{0XA1F, 0X1E5C}, {-0X1E5C, -0XA1F}}, + {{0XA1F, 0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{0X1E5C, 0XA1F}, {0XA1F, 0XA1F}}, + {{0X1E5C, 0XA1F}, {0XA1F, 0X1E5C}}, + {{0X1E5C, 0XA1F}, {0X1E5C, 0XA1F}}, + {{0X1E5C, 0XA1F}, {0X1E5C, 0X1E5C}}, + {{0X1E5C, 0XA1F}, {0XA1F, -0XA1F}}, + {{0X1E5C, 0XA1F}, {0XA1F, -0X1E5C}}, + {{0X1E5C, 0XA1F}, {0X1E5C, -0XA1F}}, + {{0X1E5C, 0XA1F}, {0X1E5C, -0X1E5C}}, + {{0X1E5C, 0XA1F}, {-0XA1F, 0XA1F}}, + {{0X1E5C, 0XA1F}, {-0XA1F, 0X1E5C}}, + {{0X1E5C, 0XA1F}, {-0X1E5C, 0XA1F}}, + {{0X1E5C, 0XA1F}, {-0X1E5C, 0X1E5C}}, + {{0X1E5C, 0XA1F}, {-0XA1F, -0XA1F}}, + {{0X1E5C, 0XA1F}, {-0XA1F, -0X1E5C}}, + {{0X1E5C, 0XA1F}, {-0X1E5C, -0XA1F}}, + {{0X1E5C, 0XA1F}, {-0X1E5C, -0X1E5C}}, + {{0X1E5C, 0X1E5C}, {0XA1F, 0XA1F}}, + {{0X1E5C, 0X1E5C}, {0XA1F, 0X1E5C}}, + {{0X1E5C, 0X1E5C}, {0X1E5C, 0XA1F}}, + {{0X1E5C, 0X1E5C}, {0X1E5C, 0X1E5C}}, + {{0X1E5C, 0X1E5C}, {0XA1F, -0XA1F}}, + {{0X1E5C, 0X1E5C}, {0XA1F, -0X1E5C}}, + {{0X1E5C, 0X1E5C}, {0X1E5C, -0XA1F}}, + {{0X1E5C, 0X1E5C}, {0X1E5C, -0X1E5C}}, + {{0X1E5C, 0X1E5C}, {-0XA1F, 0XA1F}}, + {{0X1E5C, 0X1E5C}, {-0XA1F, 0X1E5C}}, + {{0X1E5C, 0X1E5C}, {-0X1E5C, 0XA1F}}, + {{0X1E5C, 0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{0X1E5C, 0X1E5C}, {-0XA1F, -0XA1F}}, + {{0X1E5C, 0X1E5C}, {-0XA1F, -0X1E5C}}, + {{0X1E5C, 0X1E5C}, {-0X1E5C, -0XA1F}}, + {{0X1E5C, 0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{0XA1F, -0XA1F}, {0XA1F, 0XA1F}}, + {{0XA1F, -0XA1F}, {0XA1F, 0X1E5C}}, + {{0XA1F, -0XA1F}, {0X1E5C, 0XA1F}}, + {{0XA1F, -0XA1F}, {0X1E5C, 0X1E5C}}, + {{0XA1F, -0XA1F}, {0XA1F, -0XA1F}}, + {{0XA1F, -0XA1F}, {0XA1F, -0X1E5C}}, + {{0XA1F, -0XA1F}, {0X1E5C, -0XA1F}}, + {{0XA1F, -0XA1F}, {0X1E5C, -0X1E5C}}, + {{0XA1F, -0XA1F}, {-0XA1F, 0XA1F}}, + {{0XA1F, -0XA1F}, {-0XA1F, 0X1E5C}}, + {{0XA1F, -0XA1F}, 
{-0X1E5C, 0XA1F}}, + {{0XA1F, -0XA1F}, {-0X1E5C, 0X1E5C}}, + {{0XA1F, -0XA1F}, {-0XA1F, -0XA1F}}, + {{0XA1F, -0XA1F}, {-0XA1F, -0X1E5C}}, + {{0XA1F, -0XA1F}, {-0X1E5C, -0XA1F}}, + {{0XA1F, -0XA1F}, {-0X1E5C, -0X1E5C}}, + {{0XA1F, -0X1E5C}, {0XA1F, 0XA1F}}, + {{0XA1F, -0X1E5C}, {0XA1F, 0X1E5C}}, + {{0XA1F, -0X1E5C}, {0X1E5C, 0XA1F}}, + {{0XA1F, -0X1E5C}, {0X1E5C, 0X1E5C}}, + {{0XA1F, -0X1E5C}, {0XA1F, -0XA1F}}, + {{0XA1F, -0X1E5C}, {0XA1F, -0X1E5C}}, + {{0XA1F, -0X1E5C}, {0X1E5C, -0XA1F}}, + {{0XA1F, -0X1E5C}, {0X1E5C, -0X1E5C}}, + {{0XA1F, -0X1E5C}, {-0XA1F, 0XA1F}}, + {{0XA1F, -0X1E5C}, {-0XA1F, 0X1E5C}}, + {{0XA1F, -0X1E5C}, {-0X1E5C, 0XA1F}}, + {{0XA1F, -0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{0XA1F, -0X1E5C}, {-0XA1F, -0XA1F}}, + {{0XA1F, -0X1E5C}, {-0XA1F, -0X1E5C}}, + {{0XA1F, -0X1E5C}, {-0X1E5C, -0XA1F}}, + {{0XA1F, -0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{0X1E5C, -0XA1F}, {0XA1F, 0XA1F}}, + {{0X1E5C, -0XA1F}, {0XA1F, 0X1E5C}}, + {{0X1E5C, -0XA1F}, {0X1E5C, 0XA1F}}, + {{0X1E5C, -0XA1F}, {0X1E5C, 0X1E5C}}, + {{0X1E5C, -0XA1F}, {0XA1F, -0XA1F}}, + {{0X1E5C, -0XA1F}, {0XA1F, -0X1E5C}}, + {{0X1E5C, -0XA1F}, {0X1E5C, -0XA1F}}, + {{0X1E5C, -0XA1F}, {0X1E5C, -0X1E5C}}, + {{0X1E5C, -0XA1F}, {-0XA1F, 0XA1F}}, + {{0X1E5C, -0XA1F}, {-0XA1F, 0X1E5C}}, + {{0X1E5C, -0XA1F}, {-0X1E5C, 0XA1F}}, + {{0X1E5C, -0XA1F}, {-0X1E5C, 0X1E5C}}, + {{0X1E5C, -0XA1F}, {-0XA1F, -0XA1F}}, + {{0X1E5C, -0XA1F}, {-0XA1F, -0X1E5C}}, + {{0X1E5C, -0XA1F}, {-0X1E5C, -0XA1F}}, + {{0X1E5C, -0XA1F}, {-0X1E5C, -0X1E5C}}, + {{0X1E5C, -0X1E5C}, {0XA1F, 0XA1F}}, + {{0X1E5C, -0X1E5C}, {0XA1F, 0X1E5C}}, + {{0X1E5C, -0X1E5C}, {0X1E5C, 0XA1F}}, + {{0X1E5C, -0X1E5C}, {0X1E5C, 0X1E5C}}, + {{0X1E5C, -0X1E5C}, {0XA1F, -0XA1F}}, + {{0X1E5C, -0X1E5C}, {0XA1F, -0X1E5C}}, + {{0X1E5C, -0X1E5C}, {0X1E5C, -0XA1F}}, + {{0X1E5C, -0X1E5C}, {0X1E5C, -0X1E5C}}, + {{0X1E5C, -0X1E5C}, {-0XA1F, 0XA1F}}, + {{0X1E5C, -0X1E5C}, {-0XA1F, 0X1E5C}}, + {{0X1E5C, -0X1E5C}, {-0X1E5C, 0XA1F}}, + {{0X1E5C, -0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{0X1E5C, -0X1E5C}, {-0XA1F, -0XA1F}}, + {{0X1E5C, -0X1E5C}, {-0XA1F, -0X1E5C}}, + {{0X1E5C, -0X1E5C}, {-0X1E5C, -0XA1F}}, + {{0X1E5C, -0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{-0XA1F, 0XA1F}, {0XA1F, 0XA1F}}, + {{-0XA1F, 0XA1F}, {0XA1F, 0X1E5C}}, + {{-0XA1F, 0XA1F}, {0X1E5C, 0XA1F}}, + {{-0XA1F, 0XA1F}, {0X1E5C, 0X1E5C}}, + {{-0XA1F, 0XA1F}, {0XA1F, -0XA1F}}, + {{-0XA1F, 0XA1F}, {0XA1F, -0X1E5C}}, + {{-0XA1F, 0XA1F}, {0X1E5C, -0XA1F}}, + {{-0XA1F, 0XA1F}, {0X1E5C, -0X1E5C}}, + {{-0XA1F, 0XA1F}, {-0XA1F, 0XA1F}}, + {{-0XA1F, 0XA1F}, {-0XA1F, 0X1E5C}}, + {{-0XA1F, 0XA1F}, {-0X1E5C, 0XA1F}}, + {{-0XA1F, 0XA1F}, {-0X1E5C, 0X1E5C}}, + {{-0XA1F, 0XA1F}, {-0XA1F, -0XA1F}}, + {{-0XA1F, 0XA1F}, {-0XA1F, -0X1E5C}}, + {{-0XA1F, 0XA1F}, {-0X1E5C, -0XA1F}}, + {{-0XA1F, 0XA1F}, {-0X1E5C, -0X1E5C}}, + {{-0XA1F, 0X1E5C}, {0XA1F, 0XA1F}}, + {{-0XA1F, 0X1E5C}, {0XA1F, 0X1E5C}}, + {{-0XA1F, 0X1E5C}, {0X1E5C, 0XA1F}}, + {{-0XA1F, 0X1E5C}, {0X1E5C, 0X1E5C}}, + {{-0XA1F, 0X1E5C}, {0XA1F, -0XA1F}}, + {{-0XA1F, 0X1E5C}, {0XA1F, -0X1E5C}}, + {{-0XA1F, 0X1E5C}, {0X1E5C, -0XA1F}}, + {{-0XA1F, 0X1E5C}, {0X1E5C, -0X1E5C}}, + {{-0XA1F, 0X1E5C}, {-0XA1F, 0XA1F}}, + {{-0XA1F, 0X1E5C}, {-0XA1F, 0X1E5C}}, + {{-0XA1F, 0X1E5C}, {-0X1E5C, 0XA1F}}, + {{-0XA1F, 0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{-0XA1F, 0X1E5C}, {-0XA1F, -0XA1F}}, + {{-0XA1F, 0X1E5C}, {-0XA1F, -0X1E5C}}, + {{-0XA1F, 0X1E5C}, {-0X1E5C, -0XA1F}}, + {{-0XA1F, 0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{-0X1E5C, 0XA1F}, {0XA1F, 0XA1F}}, + {{-0X1E5C, 0XA1F}, {0XA1F, 0X1E5C}}, + {{-0X1E5C, 0XA1F}, {0X1E5C, 0XA1F}}, + {{-0X1E5C, 
0XA1F}, {0X1E5C, 0X1E5C}}, + {{-0X1E5C, 0XA1F}, {0XA1F, -0XA1F}}, + {{-0X1E5C, 0XA1F}, {0XA1F, -0X1E5C}}, + {{-0X1E5C, 0XA1F}, {0X1E5C, -0XA1F}}, + {{-0X1E5C, 0XA1F}, {0X1E5C, -0X1E5C}}, + {{-0X1E5C, 0XA1F}, {-0XA1F, 0XA1F}}, + {{-0X1E5C, 0XA1F}, {-0XA1F, 0X1E5C}}, + {{-0X1E5C, 0XA1F}, {-0X1E5C, 0XA1F}}, + {{-0X1E5C, 0XA1F}, {-0X1E5C, 0X1E5C}}, + {{-0X1E5C, 0XA1F}, {-0XA1F, -0XA1F}}, + {{-0X1E5C, 0XA1F}, {-0XA1F, -0X1E5C}}, + {{-0X1E5C, 0XA1F}, {-0X1E5C, -0XA1F}}, + {{-0X1E5C, 0XA1F}, {-0X1E5C, -0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {0XA1F, 0XA1F}}, + {{-0X1E5C, 0X1E5C}, {0XA1F, 0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {0X1E5C, 0XA1F}}, + {{-0X1E5C, 0X1E5C}, {0X1E5C, 0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {0XA1F, -0XA1F}}, + {{-0X1E5C, 0X1E5C}, {0XA1F, -0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {0X1E5C, -0XA1F}}, + {{-0X1E5C, 0X1E5C}, {0X1E5C, -0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {-0XA1F, 0XA1F}}, + {{-0X1E5C, 0X1E5C}, {-0XA1F, 0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {-0X1E5C, 0XA1F}}, + {{-0X1E5C, 0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {-0XA1F, -0XA1F}}, + {{-0X1E5C, 0X1E5C}, {-0XA1F, -0X1E5C}}, + {{-0X1E5C, 0X1E5C}, {-0X1E5C, -0XA1F}}, + {{-0X1E5C, 0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{-0XA1F, -0XA1F}, {0XA1F, 0XA1F}}, + {{-0XA1F, -0XA1F}, {0XA1F, 0X1E5C}}, + {{-0XA1F, -0XA1F}, {0X1E5C, 0XA1F}}, + {{-0XA1F, -0XA1F}, {0X1E5C, 0X1E5C}}, + {{-0XA1F, -0XA1F}, {0XA1F, -0XA1F}}, + {{-0XA1F, -0XA1F}, {0XA1F, -0X1E5C}}, + {{-0XA1F, -0XA1F}, {0X1E5C, -0XA1F}}, + {{-0XA1F, -0XA1F}, {0X1E5C, -0X1E5C}}, + {{-0XA1F, -0XA1F}, {-0XA1F, 0XA1F}}, + {{-0XA1F, -0XA1F}, {-0XA1F, 0X1E5C}}, + {{-0XA1F, -0XA1F}, {-0X1E5C, 0XA1F}}, + {{-0XA1F, -0XA1F}, {-0X1E5C, 0X1E5C}}, + {{-0XA1F, -0XA1F}, {-0XA1F, -0XA1F}}, + {{-0XA1F, -0XA1F}, {-0XA1F, -0X1E5C}}, + {{-0XA1F, -0XA1F}, {-0X1E5C, -0XA1F}}, + {{-0XA1F, -0XA1F}, {-0X1E5C, -0X1E5C}}, + {{-0XA1F, -0X1E5C}, {0XA1F, 0XA1F}}, + {{-0XA1F, -0X1E5C}, {0XA1F, 0X1E5C}}, + {{-0XA1F, -0X1E5C}, {0X1E5C, 0XA1F}}, + {{-0XA1F, -0X1E5C}, {0X1E5C, 0X1E5C}}, + {{-0XA1F, -0X1E5C}, {0XA1F, -0XA1F}}, + {{-0XA1F, -0X1E5C}, {0XA1F, -0X1E5C}}, + {{-0XA1F, -0X1E5C}, {0X1E5C, -0XA1F}}, + {{-0XA1F, -0X1E5C}, {0X1E5C, -0X1E5C}}, + {{-0XA1F, -0X1E5C}, {-0XA1F, 0XA1F}}, + {{-0XA1F, -0X1E5C}, {-0XA1F, 0X1E5C}}, + {{-0XA1F, -0X1E5C}, {-0X1E5C, 0XA1F}}, + {{-0XA1F, -0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{-0XA1F, -0X1E5C}, {-0XA1F, -0XA1F}}, + {{-0XA1F, -0X1E5C}, {-0XA1F, -0X1E5C}}, + {{-0XA1F, -0X1E5C}, {-0X1E5C, -0XA1F}}, + {{-0XA1F, -0X1E5C}, {-0X1E5C, -0X1E5C}}, + {{-0X1E5C, -0XA1F}, {0XA1F, 0XA1F}}, + {{-0X1E5C, -0XA1F}, {0XA1F, 0X1E5C}}, + {{-0X1E5C, -0XA1F}, {0X1E5C, 0XA1F}}, + {{-0X1E5C, -0XA1F}, {0X1E5C, 0X1E5C}}, + {{-0X1E5C, -0XA1F}, {0XA1F, -0XA1F}}, + {{-0X1E5C, -0XA1F}, {0XA1F, -0X1E5C}}, + {{-0X1E5C, -0XA1F}, {0X1E5C, -0XA1F}}, + {{-0X1E5C, -0XA1F}, {0X1E5C, -0X1E5C}}, + {{-0X1E5C, -0XA1F}, {-0XA1F, 0XA1F}}, + {{-0X1E5C, -0XA1F}, {-0XA1F, 0X1E5C}}, + {{-0X1E5C, -0XA1F}, {-0X1E5C, 0XA1F}}, + {{-0X1E5C, -0XA1F}, {-0X1E5C, 0X1E5C}}, + {{-0X1E5C, -0XA1F}, {-0XA1F, -0XA1F}}, + {{-0X1E5C, -0XA1F}, {-0XA1F, -0X1E5C}}, + {{-0X1E5C, -0XA1F}, {-0X1E5C, -0XA1F}}, + {{-0X1E5C, -0XA1F}, {-0X1E5C, -0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {0XA1F, 0XA1F}}, + {{-0X1E5C, -0X1E5C}, {0XA1F, 0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {0X1E5C, 0XA1F}}, + {{-0X1E5C, -0X1E5C}, {0X1E5C, 0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {0XA1F, -0XA1F}}, + {{-0X1E5C, -0X1E5C}, {0XA1F, -0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {0X1E5C, -0XA1F}}, + {{-0X1E5C, -0X1E5C}, {0X1E5C, -0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {-0XA1F, 0XA1F}}, + {{-0X1E5C, -0X1E5C}, {-0XA1F, 0X1E5C}}, + {{-0X1E5C, 
-0X1E5C}, {-0X1E5C, 0XA1F}}, + {{-0X1E5C, -0X1E5C}, {-0X1E5C, 0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {-0XA1F, -0XA1F}}, + {{-0X1E5C, -0X1E5C}, {-0XA1F, -0X1E5C}}, + {{-0X1E5C, -0X1E5C}, {-0X1E5C, -0XA1F}}, + {{-0X1E5C, -0X1E5C}, {-0X1E5C, -0X1E5C}}}; + +void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, + armral_cmplx_int16_t *p_dst) { + + /* Compute the number of bytes */ + uint32_t bytes = nbits >> 3U; + + /* Finally get the tail (in bits) */ + uint32_t tail = nbits - (bytes * 8); + /* Compute the number of blocks on 4 bits in the tail */ + uint32_t final_blck = tail / 4; + + /* Process a possible tail or an input with less than 8 int */ + uint32_t blk_cnt = bytes; + + uint32_t vl = hn::Lanes(du64); + // At 128 bit vectorisation comes out slower than just using memcpy. + // With longer vectors, vectorisation should overtake the memcpy method. + if (vl > 2) { // > 128 bit vectors + uint32_t unrolls = blk_cnt / vl; + for (uint32_t i = 0; i < unrolls; i++) { + Vec_i64 svsample = hn::PromoteTo(di64, hn::LoadU(du64_du8, p_src)); + p_src += vl; + Vec_i64 gather = hn::GatherIndex(di64, (const int64_t *)constellation_16qam_outer_prod, + svsample); + hn::StoreU(gather, di64, (int64_t *)p_dst + i * vl); + } + + uint32_t i = unrolls * vl; + if (i < blk_cnt) { + uint32_t tail_size = blk_cnt - i; + Mask_i64 pred_i64 = hn::FirstN(di64, tail_size); + hn::Mask pred_u8 = hn::FirstN(du64_du8, tail_size); + Vec_i64 svsample = hn::PromoteTo(di64, no_sanitize::MaskedLoadU(du64_du8, pred_u8, p_src)); + p_src += blk_cnt - i; + Vec_i64 gather = hn::MaskedGatherIndex(pred_i64, di64, + (const int64_t *)constellation_16qam_outer_prod, svsample); + hn::StoreN(gather, di64, (int64_t *)p_dst + i, tail_size); + } + + /* Process the very last sample on 4 bits */ + if (final_blck != 0U) { + uint8_t sample = *p_src; + + /* compute index */ + uint8_t index = sample >> 4; + index &= 0xF; + + p_dst[2 * blk_cnt] = constellation_16qam[index]; + } + } else { + while (blk_cnt > 0U) { + uint8_t sample = *p_src++; + memcpy(p_dst, constellation_16qam_outer_prod + sample, + 2 * sizeof(armral_cmplx_int16_t)); + p_dst += 2; + blk_cnt--; + } + /* Process the very last sample on 4 bits */ + if (final_blck != 0) { + uint8_t sample = *p_src; + uint8_t mask = 0xF; + + /* compute index */ + uint8_t index = sample >> 4; + index &= mask; + + *p_dst = constellation_16qam[index]; + } + } +} + +/* Definition of the constellation map according to 3GPP specification. 
+ * Gray encoding is used and + * 0x4F0 = 1 * sqrt(42)/42 in Q2.13 + * 0xED0 = 3 * sqrt(42)/42 in Q2.13 + * 0x18B0 = 5 * sqrt(42)/42 in Q2.13 + * 0x2290 = 7 * sqrt(42)/42 in Q2.13 + */ +static const armral_cmplx_int16_t constellation_64qam[64] = { + {0xED0, 0xED0}, {0xED0, 0x4F0}, {0x4F0, 0xED0}, + {0x4F0, 0x4F0}, {0xED0, 0x18B0}, {0xED0, 0x2290}, + {0x4F0, 0x18B0}, {0x4F0, 0x2290}, {0x18B0, 0xED0}, + {0x18B0, 0x4F0}, {0x2290, 0xED0}, {0x2290, 0x4F0}, + {0x18B0, 0x18B0}, {0x18B0, 0x2290}, {0x2290, 0x18B0}, + {0x2290, 0x2290}, {0xED0, -0xED0}, {0xED0, -0x4F0}, + {0x4F0, -0xED0}, {0x4F0, -0x4F0}, {0xED0, -0x18B0}, + {0xED0, -0x2290}, {0x4F0, -0x18B0}, {0x4F0, -0x2290}, + {0x18B0, -0xED0}, {0x18B0, -0x4F0}, {0x2290, -0xED0}, + {0x2290, -0x4F0}, {0x18B0, -0x18B0}, {0x18B0, -0x2290}, + {0x2290, -0x18B0}, {0x2290, -0x2290}, {-0xED0, 0xED0}, + {-0xED0, 0x4F0}, {-0x4F0, 0xED0}, {-0x4F0, 0x4F0}, + {-0xED0, 0x18B0}, {-0xED0, 0x2290}, {-0x4F0, 0x18B0}, + {-0x4F0, 0x2290}, {-0x18B0, 0xED0}, {-0x18B0, 0x4F0}, + {-0x2290, 0xED0}, {-0x2290, 0x4F0}, {-0x18B0, 0x18B0}, + {-0x18B0, 0x2290}, {-0x2290, 0x18B0}, {-0x2290, 0x2290}, + {-0xED0, -0xED0}, {-0xED0, -0x4F0}, {-0x4F0, -0xED0}, + {-0x4F0, -0x4F0}, {-0xED0, -0x18B0}, {-0xED0, -0x2290}, + {-0x4F0, -0x18B0}, {-0x4F0, -0x2290}, {-0x18B0, -0xED0}, + {-0x18B0, -0x4F0}, {-0x2290, -0xED0}, {-0x2290, -0x4F0}, + {-0x18B0, -0x18B0}, {-0x18B0, -0x2290}, {-0x2290, -0x18B0}, + {-0x2290, -0x2290}}; + +void armral_64qam_modulation(const uint32_t nbits, const uint8_t *p_src, + armral_cmplx_int16_t *p_dst) { + + /* Compute the number of bytes */ + uint32_t bytes = nbits >> 3U; + + /* Compute the blocks which will be processed using loop unroll */ + uint32_t unr_cnt = bytes / 3; + /* Compute the number of blocks on 6 bits in the tail */ + // This implementation performs computation + // on 24 bytes at a time, per 128 vector. + // This follows a similar approach to the SIMD + // version but requires exposition on how we make + // this possible with SVE. The logic follows + // that we load 32 bits of data, but mask against + // the 24 bits we wish to compute against. This + // is done in two ways, first we make sure that + // vl is assigned value equal to how many 24 bit + // per 128 bits we'll compute on, second we use a + // tbl lookup to load our bytes into the correct lane + // positions. The value given to specify index, 0xff000102 + // means that at every 32 bit lane, will look like the following + // [OUT_OF_RANGE|BYTE0|BYTE1|BYTE2] + // where OUT_OF_RANGE corresponds to an out of range index + // value, and BYTEN corresponds to the N'th byte of our + // 3 bytes (24 bits) read in. + const uint32_t vl = (hn::Lanes(di32) * 3) / 4; + Mask_u8 pred = hn::FirstN(du8, vl); + Vec_u32 index = hn::MulAdd(hn::Iota(du32, 0), hn::Set(du32, 0x00030303), hn::Set(du32, 0x00000102)); + index = hn::InterleaveWholeLower(index, index); + index = hn::InterleaveWholeLower(index, index); + Vec_i32 byte_mask = hn::Dup128VecFromValues(di32, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); + const uint32_t svunroll_cnt = bytes / vl; + for (uint32_t i = 0; i < svunroll_cnt; i++) { + Vec_u8 src_bytes = no_sanitize::MaskedLoadU(du8, pred, p_src); + Vec_u8 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du8, hn::BitCast(du8, index))); + Vec_i32 data = hn::BitCast(di32, tbl); + // Mask out the most significant byte of each 32 bit value. + // On Arm this can be achieved directly in the table lookup but this is not + // portable to other architectures. 
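+      // After this mask, every group of four 32-bit lanes holds the same three
+      // payload bytes as (BYTE0 << 16) | (BYTE1 << 8) | BYTE2; the per-lane shift
+      // pattern {18, 12, 6, 0} followed by "& 0x3f" then extracts the four 6-bit
+      // constellation indices from that group, matching index0..index3 in the
+      // scalar loop below.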
+ data = hn::And(data, byte_mask); + Vec_i32 shift = hn::Dup128VecFromValues(di32, 18, 12, 6, 0); + data = hn::And(hn::Shr(data, shift), hn::Set(di32, 0x3f)); + p_src += vl; + Vec_u32 gather = hn::GatherIndex(du32, (const uint32_t *)constellation_64qam, data); + hn::StoreU(gather, du32, (uint32_t *)p_dst); + p_dst += hn::Lanes(du32); + } + const uint32_t leftover_bytes = bytes - (svunroll_cnt * vl); + unr_cnt = leftover_bytes / 3; + /* Loop unroll processing */ + while (unr_cnt > 0U) { + uint8_t first = *p_src++; + uint8_t second = *p_src++; + uint8_t third = *p_src++; + + uint8_t index0 = first >> 2; + uint8_t index1 = ((first & 0x3) << 4) | (second >> 4); + uint8_t index2 = ((second & 0xF) << 2) | (third >> 6); + uint8_t index3 = third & 0x3F; + + *p_dst++ = constellation_64qam[index0]; + *p_dst++ = constellation_64qam[index1]; + *p_dst++ = constellation_64qam[index2]; + *p_dst++ = constellation_64qam[index3]; + + unr_cnt--; + } + + /* Finally get the tail (in bits) */ + unr_cnt = bytes / 3; + uint32_t tail = nbits - (unr_cnt * 3 * 8); + uint32_t final_blck = tail / 6; + /* Process the tail which might be present */ + if (final_blck != 0U) { + /* There's always at least one byte */ + uint8_t index0 = *p_src >> 2; + armral_cmplx_int16_t point = constellation_64qam[index0]; + + *p_dst = point; + p_dst++; + final_blck--; + + /* Two more samples might be present */ + if (final_blck != 0U) { + uint8_t index1 = *p_src & 0x3; + p_src++; + index1 = index1 << 4; + + uint8_t scalar_tmp = *p_src & 0xF0; + scalar_tmp = scalar_tmp >> 4; + index1 = index1 | scalar_tmp; + + point = constellation_64qam[index1]; + *p_dst = point; + p_dst++; + + final_blck--; + + /* The very last sample on 6 bits */ + if (final_blck != 0U) { + uint8_t index2 = *p_src & 0xF; + p_src++; + index2 = index2 << 2; + + scalar_tmp = *p_src & 0xC0; + scalar_tmp = scalar_tmp >> 6; + index2 = index2 | scalar_tmp; + + point = constellation_64qam[index2]; + *p_dst = point; + } + } + } +} + +/* Definition of the constellation map according to 3GPP specification. 
+ * Gray encoding is used and + * 0x274 = 1 * sqrt(170)/170 in Q2.13 + * 0x75D = 3 * sqrt(170)/170 in Q2.13 + * 0xC45 = 5 * sqrt(170)/170 in Q2.13 + * 0x112E = 7 * sqrt(170)/170 in Q2.13 + * 0x1617 = 9 * sqrt(170)/170 in Q2.13 + * 0x1AFF = 11 * sqrt(170)/170 in Q2.13 + * 0x1FE8 = 13 * sqrt(170)/170 in Q2.13 + * 0x24D0 = 15 * sqrt(170)/170 in Q2.13 + */ +static const armral_cmplx_int16_t constellation_256qam[256] = { + {0xC45, 0xC45}, {0xC45, 0x112E}, {0x112E, 0xC45}, + {0x112E, 0x112E}, {0xC45, 0x75D}, {0xC45, 0x274}, + {0x112E, 0x75D}, {0x112E, 0x274}, {0x75D, 0xC45}, + {0x75D, 0x112E}, {0x274, 0xC45}, {0x274, 0x112E}, + {0x75D, 0x75D}, {0x75D, 0x274}, {0x274, 0x75D}, + {0x274, 0x274}, {0xC45, 0x1AFF}, {0xC45, 0x1617}, + {0x112E, 0x1AFF}, {0x112E, 0x1617}, {0xC45, 0x1FE8}, + {0xC45, 0x24D0}, {0x112E, 0x1FE8}, {0x112E, 0x24D0}, + {0x75D, 0x1AFF}, {0x75D, 0x1617}, {0x274, 0x1AFF}, + {0x274, 0x1617}, {0x75D, 0x1FE8}, {0x75D, 0x24D0}, + {0x274, 0x1FE8}, {0x274, 0x24D0}, {0x1AFF, 0xC45}, + {0x1AFF, 0x112E}, {0x1617, 0xC45}, {0x1617, 0x112E}, + {0x1AFF, 0x75D}, {0x1AFF, 0x274}, {0x1617, 0x75D}, + {0x1617, 0x274}, {0x1FE8, 0xC45}, {0x1FE8, 0x112E}, + {0x24D0, 0xC45}, {0x24D0, 0x112E}, {0x1FE8, 0x75D}, + {0x1FE8, 0x274}, {0x24D0, 0x75D}, {0x24D0, 0x274}, + {0x1AFF, 0x1AFF}, {0x1AFF, 0x1617}, {0x1617, 0x1AFF}, + {0x1617, 0x1617}, {0x1AFF, 0x1FE8}, {0x1AFF, 0x24D0}, + {0x1617, 0x1FE8}, {0x1617, 0x24D0}, {0x1FE8, 0x1AFF}, + {0x1FE8, 0x1617}, {0x24D0, 0x1AFF}, {0x24D0, 0x1617}, + {0x1FE8, 0x1FE8}, {0x1FE8, 0x24D0}, {0x24D0, 0x1FE8}, + {0x24D0, 0x24D0}, + + {0xC45, -0xC45}, {0xC45, -0x112E}, {0x112E, -0xC45}, + {0x112E, -0x112E}, {0xC45, -0x75D}, {0xC45, -0x274}, + {0x112E, -0x75D}, {0x112E, -0x274}, {0x75D, -0xC45}, + {0x75D, -0x112E}, {0x274, -0xC45}, {0x274, -0x112E}, + {0x75D, -0x75D}, {0x75D, -0x274}, {0x274, -0x75D}, + {0x274, -0x274}, {0xC45, -0x1AFF}, {0xC45, -0x1617}, + {0x112E, -0x1AFF}, {0x112E, -0x1617}, {0xC45, -0x1FE8}, + {0xC45, -0x24D0}, {0x112E, -0x1FE8}, {0x112E, -0x24D0}, + {0x75D, -0x1AFF}, {0x75D, -0x1617}, {0x274, -0x1AFF}, + {0x274, -0x1617}, {0x75D, -0x1FE8}, {0x75D, -0x24D0}, + {0x274, -0x1FE8}, {0x274, -0x24D0}, {0x1AFF, -0xC45}, + {0x1AFF, -0x112E}, {0x1617, -0xC45}, {0x1617, -0x112E}, + {0x1AFF, -0x75D}, {0x1AFF, -0x274}, {0x1617, -0x75D}, + {0x1617, -0x274}, {0x1FE8, -0xC45}, {0x1FE8, -0x112E}, + {0x24D0, -0xC45}, {0x24D0, -0x112E}, {0x1FE8, -0x75D}, + {0x1FE8, -0x274}, {0x24D0, -0x75D}, {0x24D0, -0x274}, + {0x1AFF, -0x1AFF}, {0x1AFF, -0x1617}, {0x1617, -0x1AFF}, + {0x1617, -0x1617}, {0x1AFF, -0x1FE8}, {0x1AFF, -0x24D0}, + {0x1617, -0x1FE8}, {0x1617, -0x24D0}, {0x1FE8, -0x1AFF}, + {0x1FE8, -0x1617}, {0x24D0, -0x1AFF}, {0x24D0, -0x1617}, + {0x1FE8, -0x1FE8}, {0x1FE8, -0x24D0}, {0x24D0, -0x1FE8}, + {0x24D0, -0x24D0}, + + {-0xC45, 0xC45}, {-0xC45, 0x112E}, {-0x112E, 0xC45}, + {-0x112E, 0x112E}, {-0xC45, 0x75D}, {-0xC45, 0x274}, + {-0x112E, 0x75D}, {-0x112E, 0x274}, {-0x75D, 0xC45}, + {-0x75D, 0x112E}, {-0x274, 0xC45}, {-0x274, 0x112E}, + {-0x75D, 0x75D}, {-0x75D, 0x274}, {-0x274, 0x75D}, + {-0x274, 0x274}, {-0xC45, 0x1AFF}, {-0xC45, 0x1617}, + {-0x112E, 0x1AFF}, {-0x112E, 0x1617}, {-0xC45, 0x1FE8}, + {-0xC45, 0x24D0}, {-0x112E, 0x1FE8}, {-0x112E, 0x24D0}, + {-0x75D, 0x1AFF}, {-0x75D, 0x1617}, {-0x274, 0x1AFF}, + {-0x274, 0x1617}, {-0x75D, 0x1FE8}, {-0x75D, 0x24D0}, + {-0x274, 0x1FE8}, {-0x274, 0x24D0}, {-0x1AFF, 0xC45}, + {-0x1AFF, 0x112E}, {-0x1617, 0xC45}, {-0x1617, 0x112E}, + {-0x1AFF, 0x75D}, {-0x1AFF, 0x274}, {-0x1617, 0x75D}, + {-0x1617, 0x274}, {-0x1FE8, 0xC45}, 
{-0x1FE8, 0x112E}, + {-0x24D0, 0xC45}, {-0x24D0, 0x112E}, {-0x1FE8, 0x75D}, + {-0x1FE8, 0x274}, {-0x24D0, 0x75D}, {-0x24D0, 0x274}, + {-0x1AFF, 0x1AFF}, {-0x1AFF, 0x1617}, {-0x1617, 0x1AFF}, + {-0x1617, 0x1617}, {-0x1AFF, 0x1FE8}, {-0x1AFF, 0x24D0}, + {-0x1617, 0x1FE8}, {-0x1617, 0x24D0}, {-0x1FE8, 0x1AFF}, + {-0x1FE8, 0x1617}, {-0x24D0, 0x1AFF}, {-0x24D0, 0x1617}, + {-0x1FE8, 0x1FE8}, {-0x1FE8, 0x24D0}, {-0x24D0, 0x1FE8}, + {-0x24D0, 0x24D0}, + + {-0xC45, -0xC45}, {-0xC45, -0x112E}, {-0x112E, -0xC45}, + {-0x112E, -0x112E}, {-0xC45, -0x75D}, {-0xC45, -0x274}, + {-0x112E, -0x75D}, {-0x112E, -0x274}, {-0x75D, -0xC45}, + {-0x75D, -0x112E}, {-0x274, -0xC45}, {-0x274, -0x112E}, + {-0x75D, -0x75D}, {-0x75D, -0x274}, {-0x274, -0x75D}, + {-0x274, -0x274}, {-0xC45, -0x1AFF}, {-0xC45, -0x1617}, + {-0x112E, -0x1AFF}, {-0x112E, -0x1617}, {-0xC45, -0x1FE8}, + {-0xC45, -0x24D0}, {-0x112E, -0x1FE8}, {-0x112E, -0x24D0}, + {-0x75D, -0x1AFF}, {-0x75D, -0x1617}, {-0x274, -0x1AFF}, + {-0x274, -0x1617}, {-0x75D, -0x1FE8}, {-0x75D, -0x24D0}, + {-0x274, -0x1FE8}, {-0x274, -0x24D0}, {-0x1AFF, -0xC45}, + {-0x1AFF, -0x112E}, {-0x1617, -0xC45}, {-0x1617, -0x112E}, + {-0x1AFF, -0x75D}, {-0x1AFF, -0x274}, {-0x1617, -0x75D}, + {-0x1617, -0x274}, {-0x1FE8, -0xC45}, {-0x1FE8, -0x112E}, + {-0x24D0, -0xC45}, {-0x24D0, -0x112E}, {-0x1FE8, -0x75D}, + {-0x1FE8, -0x274}, {-0x24D0, -0x75D}, {-0x24D0, -0x274}, + {-0x1AFF, -0x1AFF}, {-0x1AFF, -0x1617}, {-0x1617, -0x1AFF}, + {-0x1617, -0x1617}, {-0x1AFF, -0x1FE8}, {-0x1AFF, -0x24D0}, + {-0x1617, -0x1FE8}, {-0x1617, -0x24D0}, {-0x1FE8, -0x1AFF}, + {-0x1FE8, -0x1617}, {-0x24D0, -0x1AFF}, {-0x24D0, -0x1617}, + {-0x1FE8, -0x1FE8}, {-0x1FE8, -0x24D0}, {-0x24D0, -0x1FE8}, + {-0x24D0, -0x24D0}}; + +void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, + armral_cmplx_int16_t *p_dst) { + /* Compute the number of bytes */ + uint32_t bytes = nbits >> 3U; + uint64_t vl = Lanes(du32); + /* Compute the blocks which will be processed using loop unroll */ + uint32_t unr_cnt = bytes / vl; + + for (uint32_t i = 0; i < unr_cnt; i++) { + Vec_i32 index = hn::PromoteTo(di32, hn::LoadU(di32_du8, p_src)); + p_src += vl; + Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); + hn::StoreU(gather, di32, (int32_t *)p_dst); + p_dst += vl; + } + + const uint32_t leftover_bytes = bytes - unr_cnt * vl; + if (leftover_bytes != 0U) { + hn::Mask pred = hn::FirstN(di32_du8, leftover_bytes); + Vec_i32 index = hn::PromoteTo(di32, no_sanitize::MaskedLoadU(di32_du8, pred, p_src)); + Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); + hn::StoreN(gather, di32, (int32_t *)p_dst, leftover_bytes); + } +} + +armral_status armral_modulation(const uint32_t nbits, + armral_modulation_type mod_type, + const uint8_t *p_src, + armral_cmplx_int16_t *p_dst) { + switch (mod_type) { + case ARMRAL_MOD_QPSK: + if ((nbits % 2) != 0) { + return ARMRAL_ARGUMENT_ERROR; + } + armral_qpsk_modulation(nbits, p_src, p_dst); + return ARMRAL_SUCCESS; + case ARMRAL_MOD_16QAM: + if ((nbits % 4) != 0) { + return ARMRAL_ARGUMENT_ERROR; + } + armral_16qam_modulation(nbits, p_src, p_dst); + return ARMRAL_SUCCESS; + case ARMRAL_MOD_64QAM: + if ((nbits % 6) != 0) { + return ARMRAL_ARGUMENT_ERROR; + } + armral_64qam_modulation(nbits, p_src, p_dst); + return ARMRAL_SUCCESS; + case ARMRAL_MOD_256QAM: + if ((nbits % 8) != 0) { + return ARMRAL_ARGUMENT_ERROR; + } + armral_256qam_modulation(nbits, p_src, p_dst); + return ARMRAL_SUCCESS; + } + return ARMRAL_ARGUMENT_ERROR; +} 
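For reference, a minimal usage sketch of the two entry points ported in this patch (armral_modulation and armral_demodulation), assuming the signatures shown above; the buffer sizes, the input bit pattern and the ulp value are illustrative choices for the example, not values prescribed by the library:

    #include "armral.h"
    #include <cstdint>
    #include <vector>

    static void qpsk_round_trip_example() {
      const uint32_t nbits = 16;                 // QPSK requires nbits % 2 == 0
      const uint8_t bits[2] = {0xA5, 0x3C};      // bits are read MSB-first within each byte
      std::vector<armral_cmplx_int16_t> symbols(nbits / 2);
      armral_status st = armral_modulation(nbits, ARMRAL_MOD_QPSK, bits, symbols.data());

      std::vector<int8_t> llrs(nbits);           // QPSK: two 8-bit LLRs per symbol assumed here
      if (st == ARMRAL_SUCCESS) {
        st = armral_demodulation(nbits / 2, 256, ARMRAL_MOD_QPSK, symbols.data(),
                                 llrs.data());   // 256 is an arbitrary example ulp
      }
      (void)st;
    }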
diff --git a/src/utils/acle/bits_to_bytes.hpp b/src/utils/acle/bits_to_bytes.hpp new file mode 100644 index 0000000..6769f5c --- /dev/null +++ b/src/utils/acle/bits_to_bytes.hpp @@ -0,0 +1,128 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include "intrinsics.h" +#include +#include + +namespace armral { + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead in the passed-in array "out" +// Data is read from the most significant bit in each byte to the least +// significant +inline void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { + uint32_t full_bytes = n >> 3; + // Set the mask + uint8x16_t mask = vdupq_n_u8(1); + // When shifting, we are going to be dealing with two bytes at a time + int8x16_t shifts = {-7, -6, -5, -4, -3, -2, -1, 0, + -7, -6, -5, -4, -3, -2, -1, 0}; + // Set the base index for the bytes to 0 x 8, 1 x 8 + uint64x2_t base_index = {0x0, 0x0101010101010101}; + // Increment the index by two each iteration + uint8x16_t two = vdupq_n_u8(2); + uint32_t i = 0; + for (; i + 8 < full_bytes; i += 8) { + // Load 8 bytes into an uint8x16_t + uint8x16_t bytes = vld1d_u8(&in[i]); + + uint8x16_t index = vreinterpretq_u8_u64(base_index); + // We can unroll by a factor 2 by using vqtbl1q + for (int byte_ind = 0; byte_ind < 8; byte_ind += 2) { + uint8x16_t byte = vqtbl1q_u8(bytes, index); + // Shift the bits we want to convert into the rightmost position, and mask + // with 1 + uint8x16_t new_byte = vshlq_u8(byte, shifts); + new_byte = vandq_u8(new_byte, mask); + // Next loop + index = vaddq_u8(index, two); + vst1q_u8(&out[8 * (i + byte_ind)], new_byte); + } + } + + // Deal with a vector tail + uint8x8_t mask_tail = vdup_n_u8(1); + int8x8_t shift_tail = {-7, -6, -5, -4, -3, -2, -1, 0}; + for (; i < full_bytes; ++i) { + // Load a byte and duplicate to 8 lanes of a vector + uint8x8_t byte = vld1_dup_u8(&in[i]); + // Shift the bit we want in each lane to the right-most + // position, and mask with 1 + uint8x8_t new_bytes = vshl_u8(byte, shift_tail); + new_bytes = vand_u8(new_bytes, mask_tail); + vst1_u8(&out[8 * i], new_bytes); + } + + // Now deal with a scalar tail + if ((n & 7) != 0) { + uint8_t byte = in[full_bytes]; + uint32_t bit_ind = 0; + for (uint32_t j = 8 * i; j < n; ++j, ++bit_ind) { + out[j] = (byte >> (7 - bit_ind)) & 1; + } + } +} + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead and return it in a std::vector +inline std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { + std::vector out(n); + bits_to_bytes(n, in, out.data()); + return out; +} + +// Given a byte array of zeros and ones, write this out to +// consecutive bits instead. Bytes are assumed to be big endian +// so the first bit in a byte goes to the highest bit position +inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + out[i] = (in[i * 8] & 1) << 7; + for (uint32_t j = 1; j < 8; ++j) { + out[i] |= (in[i * 8 + j] & 1) << (7 - j); + } + } + if (tail_bits != 0) { + out[full_bytes] = (in[full_bytes * 8] & 1) << 7; + for (uint32_t j = 1; j < tail_bits; ++j) { + out[full_bytes] |= (in[full_bytes * 8 + j] & 1) << (7 - j); + } + } +} + +// Loop through all of the llrs, and set the corresponding bit to 1 if LLR is +// negative, otherwise to 0. 
We do not assume that the data_out pointer is +// initialized +template +inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + data_out[i] = 0; + for (uint32_t j = 0; j < 8; ++j) { + // The first bit to write in the byte is the most significant + if (llr[i * 8 + j] < 0) { + uint32_t bit_ind = 7 ^ j; + data_out[i] |= 1 << bit_ind; + } + } + } + // Deal with tail bits + if (tail_bits != 0) { + data_out[full_bytes] = 0; + for (uint32_t i = 0; i < tail_bits; ++i) { + // The first bit to write in the byte is the most significant + if (llr[full_bytes * 8 + i] < 0) { + uint32_t bit_ind = 7 ^ i; + data_out[full_bytes] |= 1 << bit_ind; + } + } + } +} + +} // namespace armral diff --git a/src/utils/bits_to_bytes.hpp b/src/utils/bits_to_bytes.hpp index 2cc811d..99ecdd9 100644 --- a/src/utils/bits_to_bytes.hpp +++ b/src/utils/bits_to_bytes.hpp @@ -1,129 +1,13 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 */ #pragma once -#include "intrinsics.h" - -#include -#include - -namespace armral { - -// Given a byte array, where we are interested in each bit, create -// an array of bytes instead in the passed-in array "out" -// Data is read from the most significant bit in each byte to the least -// significant -inline void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { - uint32_t full_bytes = n >> 3; - // Set the mask - uint8x16_t mask = vdupq_n_u8(1); - // When shifting, we are going to be dealing with two bytes at a time - int8x16_t shifts = {-7, -6, -5, -4, -3, -2, -1, 0, - -7, -6, -5, -4, -3, -2, -1, 0}; - // Set the base index for the bytes to 0 x 8, 1 x 8 - uint64x2_t base_index = {0x0, 0x0101010101010101}; - // Increment the index by two each iteration - uint8x16_t two = vdupq_n_u8(2); - uint32_t i = 0; - for (; i + 8 < full_bytes; i += 8) { - // Load 8 bytes into an uint8x16_t - uint8x16_t bytes = vld1d_u8(&in[i]); - - uint8x16_t index = vreinterpretq_u8_u64(base_index); - // We can unroll by a factor 2 by using vqtbl1q - for (int byte_ind = 0; byte_ind < 8; byte_ind += 2) { - uint8x16_t byte = vqtbl1q_u8(bytes, index); - // Shift the bits we want to convert into the rightmost position, and mask - // with 1 - uint8x16_t new_byte = vshlq_u8(byte, shifts); - new_byte = vandq_u8(new_byte, mask); - // Next loop - index = vaddq_u8(index, two); - vst1q_u8(&out[8 * (i + byte_ind)], new_byte); - } - } - - // Deal with a vector tail - uint8x8_t mask_tail = vdup_n_u8(1); - int8x8_t shift_tail = {-7, -6, -5, -4, -3, -2, -1, 0}; - for (; i < full_bytes; ++i) { - // Load a byte and duplicate to 8 lanes of a vector - uint8x8_t byte = vld1_dup_u8(&in[i]); - // Shift the bit we want in each lane to the right-most - // position, and mask with 1 - uint8x8_t new_bytes = vshl_u8(byte, shift_tail); - new_bytes = vand_u8(new_bytes, mask_tail); - vst1_u8(&out[8 * i], new_bytes); - } - - // Now deal with a scalar tail - if ((n & 7) != 0) { - uint8_t byte = in[full_bytes]; - uint32_t bit_ind = 0; - for (uint32_t j = 8 * i; j < n; ++j, ++bit_ind) { - out[j] = (byte >> (7 - bit_ind)) & 1; - } - } -} - -// Given a byte array, where we are interested in each bit, create -// an array of bytes instead and return it in a std::vector -inline std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { - std::vector out(n); 
- bits_to_bytes(n, in, out.data()); - return out; -} - -// Given a byte array of zeros and ones, write this out to -// consecutive bits instead. Bytes are assumed to be big endian -// so the first bit in a byte goes to the highest bit position -inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { - uint32_t full_bytes = n >> 3; - uint32_t tail_bits = n & 7; - for (uint32_t i = 0; i < full_bytes; ++i) { - out[i] = (in[i * 8] & 1) << 7; - for (uint32_t j = 1; j < 8; ++j) { - out[i] |= (in[i * 8 + j] & 1) << (7 - j); - } - } - if (tail_bits != 0) { - out[full_bytes] = (in[full_bytes * 8] & 1) << 7; - for (uint32_t j = 1; j < tail_bits; ++j) { - out[full_bytes] |= (in[full_bytes * 8 + j] & 1) << (7 - j); - } - } -} - -// Loop through all of the llrs, and set the corresponding bit to 1 if LLR is -// negative, otherwise to 0. We do not assume that the data_out pointer is -// initialized -template -inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { - uint32_t full_bytes = n >> 3; - uint32_t tail_bits = n & 7; - for (uint32_t i = 0; i < full_bytes; ++i) { - data_out[i] = 0; - for (uint32_t j = 0; j < 8; ++j) { - // The first bit to write in the byte is the most significant - if (llr[i * 8 + j] < 0) { - uint32_t bit_ind = 7 ^ j; - data_out[i] |= 1 << bit_ind; - } - } - } - // Deal with tail bits - if (tail_bits != 0) { - data_out[full_bytes] = 0; - for (uint32_t i = 0; i < tail_bits; ++i) { - // The first bit to write in the byte is the most significant - if (llr[full_bytes * 8 + i] < 0) { - uint32_t bit_ind = 7 ^ i; - data_out[full_bytes] |= 1 << bit_ind; - } - } - } -} - -} // namespace armral +#ifndef ARMRAL_ARCH_HWY +#include "acle/bits_to_bytes.hpp" +#else +#include "highway/bits_to_bytes.hpp" +#endif \ No newline at end of file diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp new file mode 100644 index 0000000..91ab538 --- /dev/null +++ b/src/utils/highway/bits_to_bytes.hpp @@ -0,0 +1,131 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#pragma once + +#include +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; +#include +#include + +namespace armral { + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead in the passed-in array "out" +// Data is read from the most significant bit in each byte to the least +// significant +HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { + const size_t bits_per_byte = 8; + const size_t num_vecs = n / (hn::Lanes(du8) * bits_per_byte); + const uint32_t tail_bits = n % (hn::Lanes(du8) * bits_per_byte); + const uint32_t final_bits = n % hn::Lanes(du8); + + const Vec_u8 k1 = hn::Set(du8, uint8_t{0x01}); + const Vec_u8 shifts = hn::Dup128VecFromValues(du8, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, + 5, 4, 3, 2, 1, 0); + const Vec_u8 base_indices = hn::Dup128VecFromValues(du8, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1); + + // Process in 128-bit blocks + for (size_t i = 0; i < num_vecs; i++) { + // load a 128/8-bit chunk + Vec_u8 bytes = hn::LoadU(du8, in); + in += hn::Lanes(du8); + + // We can process two bytes at once + for (size_t byte_ind = 0; byte_ind < hn::Lanes(du8); byte_ind += 2) { + // Generate an index to select this byte pair + Vec_u8 indices = hn::Add(base_indices, hn::Set(du8, byte_ind)); + Vec_u8 
repeated_bytes = hn::TableLookupBytes(bytes, indices); + // Shift the bits we want to convert into the rightmost position and mask out the higher bits + Vec_u8 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); + hn::StoreU(spread_bits, du8, out); + out += hn::Lanes(du8); + } + } + + // handle a partial 128-bit block + if (tail_bits) { + size_t remaining_bytes = (tail_bits + bits_per_byte - 1) / bits_per_byte; + Mask_u8 load_mask = hn::FirstN(du8, remaining_bytes); + // Load partial vector of remaining bytes + Vec_u8 bytes = no_sanitize::MaskedLoadU(du8, load_mask, in); + + // We can process two bytes at once, stopping once we reach the end of the data + for (size_t byte_ind = 0; byte_ind < remaining_bytes; byte_ind += 2) { + // Generate an index to select this byte pair + Vec_u8 indices = hn::Add(base_indices, hn::Set(du8, byte_ind)); + Vec_u8 repeated_bytes = hn::TableLookupBytes(bytes, indices); + // Shift the bits we want to convert into the rightmost position and mask out the higher bits + Vec_u8 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); + bool store_remainder = (byte_ind + 2) >= remaining_bytes; + hn::StoreN(spread_bits, du8, out, + (store_remainder && final_bits) ? final_bits : hn::Lanes(du8)); + out += hn::Lanes(du8); + } + } +} + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead and return it in a std::vector +HWY_FORCED_INLINE std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { + std::vector out(n); + bits_to_bytes(n, in, out.data()); + return out; +} + +// Given a byte array of zeros and ones, write this out to +// consecutive bits instead. Bytes are assumed to be big endian +// so the first bit in a byte goes to the highest bit position +HWY_FORCED_INLINE void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + out[i] = (in[i * 8] & 1) << 7; + for (uint32_t j = 1; j < 8; ++j) { + out[i] |= (in[i * 8 + j] & 1) << (7 - j); + } + } + if (tail_bits != 0) { + out[full_bytes] = (in[full_bytes * 8] & 1) << 7; + for (uint32_t j = 1; j < tail_bits; ++j) { + out[full_bytes] |= (in[full_bytes * 8 + j] & 1) << (7 - j); + } + } +} + +// Loop through all of the llrs, and set the corresponding bit to 1 if LLR is +// negative, otherwise to 0. 
We do not assume that the data_out pointer is +// initialized +template +HWY_FORCED_INLINE void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + data_out[i] = 0; + for (uint32_t j = 0; j < 8; ++j) { + // The first bit to write in the byte is the most significant + if (llr[i * 8 + j] < 0) { + uint32_t bit_ind = 7 ^ j; + data_out[i] |= 1 << bit_ind; + } + } + } + // Deal with tail bits + if (tail_bits != 0) { + data_out[full_bytes] = 0; + for (uint32_t i = 0; i < tail_bits; ++i) { + // The first bit to write in the byte is the most significant + if (llr[full_bytes * 8 + i] < 0) { + uint32_t bit_ind = 7 ^ i; + data_out[full_bytes] |= 1 << bit_ind; + } + } + } +} + +} // namespace armral diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 2ef5cab..4ab8ed5 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -85,12 +85,36 @@ using Vec_i64 = hn::Vec; // Mask Types using Mask_u8 = hn::Mask; +using Mask_i8 = hn::Mask; +using Mask_u16 = hn::Mask; +using Mask_i16 = hn::Mask; +using Mask_u32 = hn::Mask; +using Mask_i32 = hn::Mask; +using Mask_u64 = hn::Mask; +using Mask_i64 = hn::Mask; // Rebind Tags -/* e.g. const hn::Rebind di16_di8; -where the first tag named in the rebind tag is the old type +/* +Where the first tag named in the rebind tag is the old type which the rebind tag is created from and the second is the -new tag type. */ +new tag type. +*/ +const hn::Rebind di16_di8; +const hn::Rebind di32_di8; + +const hn::Rebind di16_du8; +const hn::Rebind du16_du8; +const hn::Rebind di32_du8; +const hn::Rebind du64_du8; + +// Half Vector Tags +const hn::Half di8_half; +const hn::Half di16_half; +const hn::Half du16_half; + +// Half Vector Types +using Vec_i8_half = hn::Vec; + /* It has been found that highway implementations of MaskedLoad and MaskedLoadU are memory unsafe and will not -- GitLab From f244f20ea8b5c8818440ec4b88914dde2345dd28 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Fri, 10 Jan 2025 09:00:13 +0000 Subject: [PATCH 09/20] Update the bits_to_bytes implementation so that it is scalable. 
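For reference, a small illustration of the contract bits_to_bytes implements (example values are illustrative only; the function name and namespace are as declared in the header):

    #include "utils/bits_to_bytes.hpp"
    #include <cstdint>

    // Each input bit, read MSB-first within its byte, becomes one output byte.
    void bits_to_bytes_example() {
      uint8_t in[1] = {0xB0};             // 0b1011'0000
      uint8_t out[8];
      armral::bits_to_bytes(8, in, out);  // out = {1, 0, 1, 1, 0, 0, 0, 0}
    }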
--- src/utils/highway/bits_to_bytes.hpp | 44 ++++++++++++++--------------- src/utils/hwy_types.hpp | 3 ++ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp index 91ab538..76ccac0 100644 --- a/src/utils/highway/bits_to_bytes.hpp +++ b/src/utils/highway/bits_to_bytes.hpp @@ -20,52 +20,52 @@ namespace armral { // significant HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { const size_t bits_per_byte = 8; - const size_t num_vecs = n / (hn::Lanes(du8) * bits_per_byte); - const uint32_t tail_bits = n % (hn::Lanes(du8) * bits_per_byte); - const uint32_t final_bits = n % hn::Lanes(du8); + const size_t num_vecs = n / (hn::Lanes(du8x16) * bits_per_byte); + const uint32_t tail_bits = n % (hn::Lanes(du8x16) * bits_per_byte); + const uint32_t final_bits = n % hn::Lanes(du8x16); - const Vec_u8 k1 = hn::Set(du8, uint8_t{0x01}); - const Vec_u8 shifts = hn::Dup128VecFromValues(du8, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, + const Vec_u8x16 k1 = hn::Set(du8x16, uint8_t{0x01}); + const Vec_u8x16 shifts = hn::Dup128VecFromValues(du8x16, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0); - const Vec_u8 base_indices = hn::Dup128VecFromValues(du8, 0, 0, 0, 0, 0, 0, 0, 0, + const Vec_u8x16 base_indices = hn::Dup128VecFromValues(du8x16, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); // Process in 128-bit blocks for (size_t i = 0; i < num_vecs; i++) { // load a 128/8-bit chunk - Vec_u8 bytes = hn::LoadU(du8, in); - in += hn::Lanes(du8); + Vec_u8x16 bytes = hn::LoadU(du8x16, in); + in += hn::Lanes(du8x16); // We can process two bytes at once - for (size_t byte_ind = 0; byte_ind < hn::Lanes(du8); byte_ind += 2) { + for (size_t byte_ind = 0; byte_ind < hn::Lanes(du8x16); byte_ind += 2) { // Generate an index to select this byte pair - Vec_u8 indices = hn::Add(base_indices, hn::Set(du8, byte_ind)); - Vec_u8 repeated_bytes = hn::TableLookupBytes(bytes, indices); + Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind)); + Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices); // Shift the bits we want to convert into the rightmost position and mask out the higher bits - Vec_u8 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); - hn::StoreU(spread_bits, du8, out); - out += hn::Lanes(du8); + Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); + hn::StoreU(spread_bits, du8x16, out); + out += hn::Lanes(du8x16); } } // handle a partial 128-bit block if (tail_bits) { size_t remaining_bytes = (tail_bits + bits_per_byte - 1) / bits_per_byte; - Mask_u8 load_mask = hn::FirstN(du8, remaining_bytes); + Mask_u8x16 load_mask = hn::FirstN(du8x16, remaining_bytes); // Load partial vector of remaining bytes - Vec_u8 bytes = no_sanitize::MaskedLoadU(du8, load_mask, in); + Vec_u8x16 bytes = no_sanitize::MaskedLoadU(du8x16, load_mask, in); // We can process two bytes at once, stopping once we reach the end of the data for (size_t byte_ind = 0; byte_ind < remaining_bytes; byte_ind += 2) { // Generate an index to select this byte pair - Vec_u8 indices = hn::Add(base_indices, hn::Set(du8, byte_ind)); - Vec_u8 repeated_bytes = hn::TableLookupBytes(bytes, indices); + Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind)); + Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices); // Shift the bits we want to convert into the rightmost position and mask out the higher bits - Vec_u8 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); + Vec_u8x16 spread_bits 
= hn::And(k1, hn::Shr(repeated_bytes, shifts)); bool store_remainder = (byte_ind + 2) >= remaining_bytes; - hn::StoreN(spread_bits, du8, out, - (store_remainder && final_bits) ? final_bits : hn::Lanes(du8)); - out += hn::Lanes(du8); + hn::StoreN(spread_bits, du8x16, out, + (store_remainder && final_bits) ? final_bits : hn::Lanes(du8x16)); + out += hn::Lanes(du8x16); } } } diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 4ab8ed5..ac2ec4b 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -45,6 +45,9 @@ using Vec_i32x4 = hn::Vec; using Vec_u64x2 = hn::Vec; using Vec_i64x2 = hn::Vec; +// Mask Types +using Mask_u8x16 = hn::Mask; + // Rebind Tags /* e.g. const hn::Rebind di16x8_di8x16; where the first tag named in the rebind tag is the old type -- GitLab From edb01cdad85aa3e552549ed4ebfa0f3d0089c43e Mon Sep 17 00:00:00 2001 From: Will Barber Date: Fri, 10 Jan 2025 11:03:33 +0000 Subject: [PATCH 10/20] Switch over to MaskedLoad MaskedLoad is already in upstream Highway. This is implemented as an unaligned load in both SVE and x86. Note the change in argument order. --- src/LowerPHY/Scrambling/highway/arm_scrambling.cpp | 4 ++-- src/UpperPHY/Demodulation/highway/arm_demodulation.cpp | 6 +++--- src/UpperPHY/Modulation/highway/arm_modulation.cpp | 10 +++++----- src/utils/highway/bits_to_bytes.hpp | 2 +- src/utils/hwy_types.hpp | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp index 1f80200..99fe133 100644 --- a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp @@ -22,8 +22,8 @@ HWY_FORCED_INLINE void xor_u8_partial(const uint8_t *__restrict &src, const uint8_t *__restrict &seq, uint8_t *&dst, size_t n_lanes) { Mask_u8 final_mask = hn::FirstN(du8, n_lanes); - Vec_u8 src_vec = no_sanitize::MaskedLoadU(du8, final_mask, src); - Vec_u8 seq_vec = no_sanitize::MaskedLoadU(du8, final_mask, seq); + Vec_u8 src_vec = no_sanitize::MaskedLoad(final_mask, du8, src); + Vec_u8 seq_vec = no_sanitize::MaskedLoad(final_mask, du8, seq); hn::StoreN(hn::Xor(src_vec, seq_vec), du8, dst, n_lanes); } diff --git a/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp index 72c0779..4ae1002 100644 --- a/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp +++ b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp @@ -83,7 +83,7 @@ armral_demodulation_qpsk(const uint32_t n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); Vec_i8_half llr8_a = generate_partial_llrs_half_vect(rec_a, weight_v); @@ -175,7 +175,7 @@ armral_demodulation_16qam(const uint32_t n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); /* Computing L(c0/r) and L(c1/r) */ Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); @@ -428,7 +428,7 @@ armral_demodulation_256qam(const uint32_t 
n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoadU(di16, load_mask, (const int16_t *)p_src); + Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); /* Computing L(c0/r) and L(c1/r) */ Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); diff --git a/src/UpperPHY/Modulation/highway/arm_modulation.cpp b/src/UpperPHY/Modulation/highway/arm_modulation.cpp index c225621..5d7ecc0 100644 --- a/src/UpperPHY/Modulation/highway/arm_modulation.cpp +++ b/src/UpperPHY/Modulation/highway/arm_modulation.cpp @@ -70,7 +70,7 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, index = hn::InterleaveWholeLower(du16, index, index); index = hn::InterleaveWholeLower(du16, index, index); for (uint32_t i = 0; i < unrolls; i++) { - Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoadU(du16_du8, pred, p_src)); + Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoad(pred, du16_du8, p_src)); p_src += vl; Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); Mask_i16 mask_pred = hn::RebindMask(di16, hn::TestBit(tbl, mask)); @@ -83,7 +83,7 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, const hn::Mask load_lanes = hn::FirstN(du16_du8, leftover_bytes); const uint32_t active_store_lanes = leftover_bytes * 8; if (leftover_bytes != 0) { - Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoadU(du16_du8, load_lanes, p_src)); + Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoad(load_lanes, du16_du8, p_src)); p_src += leftover_bytes; Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); Mask_i16 mask_pred = hn::RebindMask(di16, hn::TestBit(tbl, mask)); @@ -417,7 +417,7 @@ void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, uint32_t tail_size = blk_cnt - i; Mask_i64 pred_i64 = hn::FirstN(di64, tail_size); hn::Mask pred_u8 = hn::FirstN(du64_du8, tail_size); - Vec_i64 svsample = hn::PromoteTo(di64, no_sanitize::MaskedLoadU(du64_du8, pred_u8, p_src)); + Vec_i64 svsample = hn::PromoteTo(di64, no_sanitize::MaskedLoad(pred_u8, du64_du8, p_src)); p_src += blk_cnt - i; Vec_i64 gather = hn::MaskedGatherIndex(pred_i64, di64, (const int64_t *)constellation_16qam_outer_prod, svsample); @@ -521,7 +521,7 @@ void armral_64qam_modulation(const uint32_t nbits, const uint8_t *p_src, Vec_i32 byte_mask = hn::Dup128VecFromValues(di32, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); const uint32_t svunroll_cnt = bytes / vl; for (uint32_t i = 0; i < svunroll_cnt; i++) { - Vec_u8 src_bytes = no_sanitize::MaskedLoadU(du8, pred, p_src); + Vec_u8 src_bytes = no_sanitize::MaskedLoad(pred, du8, p_src); Vec_u8 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du8, hn::BitCast(du8, index))); Vec_i32 data = hn::BitCast(di32, tbl); // Mask out the most significant byte of each 32 bit value. 
@@ -726,7 +726,7 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, const uint32_t leftover_bytes = bytes - unr_cnt * vl; if (leftover_bytes != 0U) { hn::Mask pred = hn::FirstN(di32_du8, leftover_bytes); - Vec_i32 index = hn::PromoteTo(di32, no_sanitize::MaskedLoadU(di32_du8, pred, p_src)); + Vec_i32 index = hn::PromoteTo(di32, no_sanitize::MaskedLoad(pred, di32_du8, p_src)); Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); hn::StoreN(gather, di32, (int32_t *)p_dst, leftover_bytes); } diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp index 76ccac0..633667d 100644 --- a/src/utils/highway/bits_to_bytes.hpp +++ b/src/utils/highway/bits_to_bytes.hpp @@ -53,7 +53,7 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out size_t remaining_bytes = (tail_bits + bits_per_byte - 1) / bits_per_byte; Mask_u8x16 load_mask = hn::FirstN(du8x16, remaining_bytes); // Load partial vector of remaining bytes - Vec_u8x16 bytes = no_sanitize::MaskedLoadU(du8x16, load_mask, in); + Vec_u8x16 bytes = no_sanitize::MaskedLoad(load_mask, du8x16, in); // We can process two bytes at once, stopping once we reach the end of the data for (size_t byte_ind = 0; byte_ind < remaining_bytes; byte_ind += 2) { diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index ac2ec4b..4f976d4 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -133,7 +133,7 @@ from sanitization. namespace no_sanitize { template LOAD_ATTR hn::VFromD -MaskedLoadU(D d, M m, const hn::TFromD *HWY_RESTRICT unaligned) { - return hn::MaskedLoadU(d, m, unaligned); +MaskedLoad(M m, D d, const hn::TFromD *HWY_RESTRICT unaligned) { + return hn::MaskedLoad(m, d, unaligned); } } // namespace no_sanitize \ No newline at end of file -- GitLab From 5ef3b7fc39e061cc22ef45cdde85da5f75dcaa0a Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Fri, 10 Jan 2025 11:31:36 +0000 Subject: [PATCH 11/20] Port LDPC encode & decode to Highway The SVE2 implementation was used for the port which shows some performance improvements on NEON and SVE over the original implementation. 
--- CMakeLists.txt | 20 +- armral_hwy.cmake.in | 8 +- src/UpperPHY/LDPC/highway/ldpc_decoder.cpp | 967 +++++++++++++++++++++ src/UpperPHY/LDPC/highway/ldpc_encoder.cpp | 813 +++++++++++++++++ src/UpperPHY/LDPC/ldpc_encoder.cpp | 929 +------------------- src/UpperPHY/LDPC/ldpc_tables.hpp | 934 ++++++++++++++++++++ test/UpperPHY/LDPC/Decoding/main.cpp | 1 - 7 files changed, 2729 insertions(+), 943 deletions(-) create mode 100644 src/UpperPHY/LDPC/highway/ldpc_decoder.cpp create mode 100644 src/UpperPHY/LDPC/highway/ldpc_encoder.cpp create mode 100644 src/UpperPHY/LDPC/ldpc_tables.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index bc25495..d38db8b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -431,10 +431,10 @@ if(BUILD_TESTING) # add_armral_test(tail_biting_convolutional_encoding # test/UpperPHY/ConvolutionalEncoder/main.cpp) add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - # add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - # add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) - # add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - # add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) # add_armral_test(polar_crc_attachment # test/UpperPHY/Polar/CrcAttachment/main.cpp) @@ -641,12 +641,12 @@ if(BUILD_TESTING) # add_armral_bench(tail_biting_convolutional_encoding # bench/UpperPHY/ConvolutionalEncoder/main.cpp) add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - # add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - # add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - # add_armral_bench(ldpc_rate_matching - # bench/UpperPHY/LDPC/RateMatching/main.cpp) - # add_armral_bench(ldpc_rate_recovery - # bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching + bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery + bench/UpperPHY/LDPC/RateRecovery/main.cpp) add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) # add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) # add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index 0d4bc8c..c500d43 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -145,10 +145,10 @@ set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp diff --git a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp new file mode 100644 index 0000000..5bcdeda --- /dev/null +++ b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp @@ -0,0 +1,967 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ + +#include "../ldpc_coding.hpp" +#include "utils/allocators.hpp" +#include "utils/bits_to_bytes.hpp" + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +#include +#include +#include + +namespace { +using Mask_i16 = hn::Mask; + +inline int16_t __attribute__((always_inline)) sat_abs_16(int16_t a) { + int16_t partial_res = abs((int32_t)a); + if (partial_res > INT8_MAX) { + return INT8_MAX; + } + if (partial_res < INT8_MIN) { + return INT8_MIN; + } + return partial_res; +} + +inline int16_t __attribute__((always_inline)) sat_add_16(int16_t a, int16_t b) { + int16_t partial_res = (uint32_t)a + (uint32_t)b; + if (partial_res > INT8_MAX) { + return INT8_MAX; + } + if (partial_res < INT8_MIN) { + return INT8_MIN; + } + return partial_res; +} + +inline int16_t __attribute__((always_inline)) sat_sub_16(int16_t a, int16_t b) { + int16_t partial_res = (int32_t)a - (int32_t)b; + if (partial_res > INT8_MAX) { + return INT8_MAX; + } + if (partial_res < INT8_MIN) { + return INT8_MIN; + } + return partial_res; +} + +struct ldpc_layer_data { + uint32_t z; + uint32_t lsi; + uint32_t row; + uint32_t row_start_ind; + const armral_ldpc_base_graph_t *graph; + uint32_t num_cols; + const uint32_t *shift_ptr; + const uint32_t *col_ptr; + + ldpc_layer_data(uint32_t z_in, uint32_t lsi_in, + const armral_ldpc_base_graph_t *graph_in) + : z(z_in), lsi(lsi_in), row(0), row_start_ind(0), graph(graph_in), + num_cols(graph->row_start_inds[1]), + shift_ptr(graph->shifts + lsi * num_cols), col_ptr(graph->col_inds) {} + + void next() { + row++; + row_start_ind = graph->row_start_inds[row]; + col_ptr += num_cols; + num_cols = graph->row_start_inds[row + 1] - row_start_ind; + shift_ptr = graph->shifts + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * num_cols; + } +}; + +template +inline T max(T a, T b) { + return a > b ? a : b; +} + +template +inline T min(T a, T b) { + return a < b ? a : b; +} + +enum lifting_size_category { CAT_TINY, CAT_TAIL, CAT_LARGE }; + +template +class crc_checker { +public: + crc_checker(uint32_t z, uint32_t crc_idx, Allocator &allocator) : m_z(z) { + // Calculate K', which is the number of info bits + CRC bits (i.e. 
the + // non-filler bits of the code block) + m_k_prime = crc_idx + 24; + + // The CRC calculation routine expects a particular size of input (n % 16 = 0 + // where n is the number of bytes), which requires padding the input to the + // required size + m_buffer_size = (m_k_prime + 7) / 8; + m_total_bits = m_k_prime; + if (m_k_prime % 128 != 0) { + m_num_pad_bits = 128 - (m_k_prime % 128); + m_total_bits = m_k_prime + m_num_pad_bits; + m_buffer_size = m_total_bits >> 3; + } + + m_llrs = allocate_uninitialized(allocator, m_total_bits + m_z - 1); + m_buffer = allocate_uninitialized(allocator, m_buffer_size); + } + + bool check(const int16_t *new_llrs) { + // Copy the LLRs corresponding to the bits we need to do the CRC check after + // the padding bits + memset(m_llrs.get(), 0, m_num_pad_bits * sizeof(int16_t)); + for (uint32_t num_block = 0; num_block < ((m_k_prime + m_z - 1) / m_z); + num_block++) { + memcpy(m_llrs.get() + m_num_pad_bits + (num_block * m_z), + new_llrs + (2 * num_block * m_z), m_z * sizeof(int16_t)); + } + + // Hard decode + armral::llrs_to_bits(m_total_bits, m_llrs.get(), m_buffer.get()); + + // Generate the CRC parity bits + uint64_t crc; + armral_crc24_b_be(m_buffer_size, (const uint64_t *)m_buffer.get(), &crc); + + // If the CRC is zero then the code block has been correctly decoded and we + // can terminate the iterations early + return (crc == 0); + } + +private: + uint32_t m_z{0}; + uint32_t m_k_prime{0}; + uint32_t m_buffer_size{0}; + uint32_t m_num_pad_bits{0}; + uint32_t m_total_bits{0}; + unique_ptr m_llrs; + unique_ptr m_buffer; +}; + +template +bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, int32_t num_lanes, + int32_t full_vec, uint32_t tail_size, int16_t *check); + +template<> +bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, + int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *check) { + // Loop through the rows in the base graph + bool passed = true; + for (uint32_t row = 0; row < graph->nrows && passed; ++row) { + auto row_start_ind = graph->row_start_inds[row]; + auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; + const auto *col_ptr = graph->col_inds + row_start_ind; + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * num_cols; + // Loop through the rows in the block + for (uint32_t zb = 0; zb < z && passed; ++zb) { + // Loop through the columns in the row + int16_t scal_check = 0; + for (uint32_t col = 0; col < num_cols; ++col) { + auto shift = (shift_ptr[col] + zb) % z; + auto codeword_ind = col_ptr[col] * z + shift; + scal_check ^= llrs[codeword_ind]; + } + passed &= scal_check >= 0; + } + } + return passed; +} + +template<> +bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, + int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *check) { + // Loop through the rows in the base graph + bool passed = true; + Mask_i16 pg_tail = hn::FirstN(di16, (size_t)tail_size); + + for (uint32_t row = 0; row < graph->nrows && passed; ++row) { + auto row_start_ind = graph->row_start_inds[row]; + auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; + const auto *col_ptr = graph->col_inds + row_start_ind; + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * num_cols; + memset(check, 0, z * sizeof(int16_t)); + + // Loop through the columns + for 
(uint32_t col = 0; col < num_cols; ++col) { + auto shift = (shift_ptr[col] % z); + auto codeword_ind = col_ptr[col] * (2 * z) + shift; + + // No need to loop here, as there is only a tail + const int16_t *llrs_ptr = llrs + codeword_ind; + int16_t *check_ptr = check; + + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); + Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); + hn::StoreN(result_reg, di16, check_ptr, tail_size); + } + for (uint32_t zb = 0; zb < z && passed; ++zb) { + passed &= check[zb] >= 0; + } + } + return passed; +} + +template<> +bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, + int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *check) { + Mask_i16 pg_tail = hn::FirstN(di16, tail_size); + + // Loop through the rows in the base graph + bool passed = true; + for (uint32_t row = 0; row < graph->nrows && passed; ++row) { + auto row_start_ind = graph->row_start_inds[row]; + auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; + const auto *col_ptr = graph->col_inds + row_start_ind; + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * num_cols; + memset(check, 0, z * sizeof(int16_t)); + + // Loop through the columns + for (uint32_t col = 0; col < num_cols; ++col) { + auto shift = (shift_ptr[col] % z); + auto codeword_ind = col_ptr[col] * (2 * z) + shift; + // Loop through the rows in the block + + // The check can be done on the LLRs instead of on the bit values, as + // there is a one-to-one transform between LLRs and bit. Negative LLRs + // represent a hard decision for the bit to be one, and non-negative + // values represent a zero. Hence the check needs to xor all LLRs + // and then assert that the result is non-negative. 
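+      // XOR accumulates the parity of the sign bits, so the running `check`
+      // word is negative exactly when an odd number of the LLRs connected to
+      // this check node are negative, that is, when the parity check fails.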
+ const int16_t *llrs_ptr = llrs + codeword_ind; + int16_t *check_ptr = check; + + for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { + Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); + Vec_i16 check_reg = hn::LoadU(di16, check_ptr); + Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); + hn::StoreU(result_reg, di16, check_ptr); + + // Increment pointers + llrs_ptr += num_lanes; + check_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); + Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); + hn::StoreN(result_reg, di16, check_ptr, tail_size); + } + } + for (uint32_t zb = 0; zb < z && passed; ++zb) { + passed &= check[zb] >= 0; + } + } + return passed; +} + +// For each check node m in the layer, compute: +// - the variable-to-check-node messages L(n,m) for each variable node n in +// \psi(m), where \psi(m) is the set of variable nodes connected to m: +// L(n,m) = LLR(n) - R(n,m) +// - the products \prod_{n' \in \psi(m)} L(n',m) (they will be used to compute +// sign(R(n,m)) in a second step) +// - \min_{n \in \psi(m)} |L(n,m)| and the second minimum (they will be used to +// compute |R(n,m)| in a second step) +template +void compute_l_product_min1_and_min2( + int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, + const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, + int16_t *row_sign_array); + +template<> +void compute_l_product_min1_and_min2( + int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, + const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, + int16_t *row_sign_array) { + const auto *r_ptr = r; + // Loop through the Z rows in the layer (check node m) + for (uint32_t zb = 0; zb < d->z; ++zb) { + // Loop through the columns in the row (variable node n in psi(m)) + // Column 0 + auto shift = (d->shift_ptr[0] + zb) % d->z; + int16_t l_val = sat_sub_16(llrs[d->col_ptr[0] * d->z + shift], *(r_ptr++)); + + int16_t row_sign = l_val; + + int16_t row_min = sat_abs_16(l_val); + + *(l++) = l_val; + + // Column 1 + shift = (d->shift_ptr[1] + zb) % d->z; + l_val = sat_sub_16(llrs[d->col_ptr[1] * d->z + shift], *(r_ptr++)); + + row_sign ^= l_val; + + int16_t abs_val = sat_abs_16(l_val); + int16_t row_min2 = max(row_min, abs_val); + row_min = min(row_min, abs_val); + + *(l++) = l_val; + + // Columns n >= 2 + for (uint32_t col = 2; col < d->num_cols; ++col) { + // Compute L(n,m) = LLR(n) - R(n,m) + shift = (d->shift_ptr[col] + zb) % d->z; + l_val = sat_sub_16(llrs[d->col_ptr[col] * d->z + shift], *(r_ptr++)); + + // Compute the product of L(n',m), for all the columns (all n' in psi(m)) + row_sign ^= l_val; + + // Compute the min(|L(n,m)|) and the second minimum + abs_val = sat_abs_16(l_val); + row_min2 = max(row_min, min(row_min2, abs_val)); + row_min = min(row_min, abs_val); + + // Store L(n,m) + *(l++) = l_val; + } + + // Store the two minima and the product for Z rows + row_min_array[zb] = row_min; + row_min2_array[zb] = row_min2; + row_sign_array[zb] = row_sign; + } +} + +template<> +void compute_l_product_min1_and_min2( + int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, + const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *row_min_array, int16_t 
*row_min2_array, + int16_t *row_sign_array) { + // Case for lifting sizes Z such as 8 <= Z < 16 + Mask_i16 pg_tail = hn::FirstN(di16, tail_size); + + // Loop through the columns in the row (variable node n in psi(m)) + // Column 0 + int16_t *l_ptr = l; + auto shift = d->shift_ptr[0] % d->z; + const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; + const int16_t *r_ptr = r; + + Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + Vec_i16 row_sign = l_reg; + + Vec_i16 row_min = hn::SaturatedAbs(l_reg); + + hn::StoreN(l_reg, di16, l_ptr, tail_size); + + // Column 1 + l_ptr = l + d->z; + shift = d->shift_ptr[1] % d->z; + llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; + r_ptr = r + d->z; + + r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + row_sign = hn::Xor(row_sign, l_reg); + + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + Vec_i16 row_min2 = hn::Max(row_min, abs_reg); + row_min = hn::Min(row_min, abs_reg); + + hn::StoreN(l_reg, di16, l_ptr, tail_size); + + // Columns n >= 2 + for (uint32_t col = 2; col < d->num_cols; ++col) { + l_ptr = l + d->z * col; + shift = d->shift_ptr[col] % d->z; + llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; + r_ptr = r + d->z * col; + + // Compute L(n,m) = LLR(n) - R(n,m) + r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + // Compute the product of L(n',m), for all the columns (all n' in psi(m)) + row_sign = hn::Xor(row_sign, l_reg); + + // Compute the min(|L(n,m)|) and the second minimum + abs_reg = hn::SaturatedAbs(l_reg); + row_min2 = hn::Max(row_min, hn::Min(row_min2, abs_reg)); + row_min = hn::Min(row_min, abs_reg); + + // Store L(n,m) + hn::StoreN(l_reg, di16, l_ptr, tail_size); + } + + // Store the two minima and the product for Z rows + hn::StoreN(row_min, di16, row_min_array, tail_size); + hn::StoreN(row_min2, di16, row_min2_array, tail_size); + hn::StoreN(row_sign, di16, row_sign_array, tail_size); +} + +template<> +void compute_l_product_min1_and_min2( + int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, + const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, + int16_t *row_sign_array) { + Mask_i16 pg_tail = hn::FirstN(di16, tail_size); + + // Loop through the columns in the row (variable node n in psi(m)) + // Column 0 + int16_t *l_ptr = l; + auto shift = d->shift_ptr[0] % d->z; + const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; + const int16_t *r_ptr = r; + int16_t *sign_ptr = row_sign_array; + int16_t *min_ptr = row_min_array; + + for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { + Vec_i16 r_reg = hn::LoadU(di16, r_ptr); + Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + hn::StoreU(l_reg, di16, sign_ptr); + + hn::StoreU(hn::SaturatedAbs(l_reg), di16, min_ptr); + + hn::StoreU(l_reg, di16, l_ptr); + + sign_ptr += num_lanes; + min_ptr += num_lanes; + r_ptr += num_lanes; + l_ptr += num_lanes; + llrs_ptr += num_lanes; + } + + if (tail_size != 0) { + Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, 
llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + hn::StoreN(l_reg, di16, sign_ptr, tail_size); + + hn::StoreN(hn::SaturatedAbs(l_reg), di16, min_ptr, tail_size); + + hn::StoreN(l_reg, di16, l_ptr, tail_size); + } + + // Column 1 + shift = d->shift_ptr[1] % d->z; + l_ptr = l + d->z; + llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; + r_ptr = r + d->z; + sign_ptr = row_sign_array; + min_ptr = row_min_array; + int16_t *min2_ptr = row_min2_array; + + for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { + Vec_i16 r_reg = hn::LoadU(di16, r_ptr); + Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); + hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); + + Vec_i16 min_reg = hn::LoadU(di16, min_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + hn::StoreU(hn::Max(min_reg, abs_reg), di16, min2_ptr); + hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); + + hn::StoreU(l_reg, di16, l_ptr); + + sign_ptr += num_lanes; + min_ptr += num_lanes; + min2_ptr += num_lanes; + r_ptr += num_lanes; + l_ptr += num_lanes; + llrs_ptr += num_lanes; + } + + if (tail_size != 0) { + Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); + hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); + + Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + hn::StoreN(hn::Max(min_reg, abs_reg), di16, min2_ptr, tail_size); + hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); + + hn::StoreN(l_reg, di16, l_ptr, tail_size); + } + + // Columns n >= 2 + for (uint32_t col = 2; col < d->num_cols; ++col) { + l_ptr = l + d->z * col; + shift = d->shift_ptr[col] % d->z; + llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; + r_ptr = r + d->z * col; + sign_ptr = row_sign_array; + min_ptr = row_min_array; + min2_ptr = row_min2_array; + + // Loop through the Z rows in the layer (check node m) + for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { + // Compute L(n,m) = LLR(n) - R(n,m) + Vec_i16 r_reg = hn::LoadU(di16, r_ptr); + Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + // Compute the product of L(n',m), for all the columns (all n' in psi(m)) + Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); + hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); + + // Compute the min(|L(n,m)|) and the second minimum + Vec_i16 min_reg = hn::LoadU(di16, min_ptr); + Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + hn::StoreU(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr); + hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); + + // Store L(n,m) + hn::StoreU(l_reg, di16, l_ptr); + + sign_ptr += num_lanes; + min_ptr += num_lanes; + min2_ptr += num_lanes; + r_ptr += num_lanes; + l_ptr += num_lanes; + llrs_ptr += num_lanes; + } + + // Process tail + if (tail_size != 0) { + Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); + Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); + Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); + + Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); + hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); + + Vec_i16 min_reg = 
no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); + Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + hn::StoreN(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr, + tail_size); + hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); + + hn::StoreN(l_reg, di16, l_ptr, tail_size); + } + } +} + +// For each check node m in the layer, compute: +// - The check-to-variable-node messages R(n,m) for each n in \psi(m), where +// \psi(m) is the set of variable nodes connected to check node m: +// sign(R(n,m)) = \prod_{n' \in \psi(m)/n} sign(L(n',m)) = +// = \prod_{n' \in \psi(m)} sign(L(n',m)) / sign(L(n,m)) +// |R(n,m)| = \min_{n' \in \psi(m)/n} |L(n',m)| = +// = the first minimum when n' != n, the second minimum otherwise +// - The log likelihood ratios for each n in \psi(m): +// LLR(n) = R(n,m) + L(n,m) +template +void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, + const ldpc_layer_data *d, int32_t num_lanes, + int32_t full_vec, uint32_t tail_size, + const int16_t *row_min_array, + const int16_t *row_min2_array, + const int16_t *row_sign_array); + +template<> +void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, + const ldpc_layer_data *d, int32_t num_lanes, + int32_t full_vec, uint32_t tail_size, + const int16_t *row_min_array, + const int16_t *row_min2_array, + const int16_t *row_sign_array) { + // Loop through the Z rows in the layer (check node m) + for (uint32_t zb = 0; zb < d->z; ++zb) { + const int16_t *l_ptr = l + zb * d->num_cols; + // Loop through the columns in the row (variable node n in psi(m)) + for (uint32_t col = 0; col < d->num_cols; ++col) { + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + int16_t col_sign = (row_sign_array[zb] ^ l_ptr[col]) < 0 ? -1 : 1; + + // Compute R(n,m) + int16_t abs_val = sat_abs_16(l_ptr[col]); + int16_t r_val = + col_sign * (abs_val == row_min_array[zb] ? 
row_min2_array[zb] + : row_min_array[zb]); + + // Compute LLR(n) = R(n,m) + L(n,m) + auto shift = (d->shift_ptr[col] + zb) % d->z; + auto col_ind = d->col_ptr[col] * d->z + shift; + llrs[col_ind] = sat_add_16(r_val, l_ptr[col]); + + // Store R(n,m) for the next iteration + r[col] = r_val; + } + } +} + +template<> +void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, + const ldpc_layer_data *d, int32_t num_lanes, + int32_t full_vec, uint32_t tail_size, + const int16_t *row_min_array, + const int16_t *row_min2_array, + const int16_t *row_sign_array) { + // Case for lifting sizes 4 <= Z < 8 (rows in the layer) + Mask_i16 pg_tail = hn::FirstN(di16, tail_size); + + Vec_i16 row_min = no_sanitize::MaskedLoad(pg_tail, di16, row_min_array); + Vec_i16 row_min2 = no_sanitize::MaskedLoad(pg_tail, di16, row_min2_array); + Vec_i16 row_sign = no_sanitize::MaskedLoad(pg_tail, di16, row_sign_array); + + // Loop through the columns in the row (variable node n in psi(m)) + for (uint32_t col = 0; col < d->num_cols; ++col) { + auto shift = d->shift_ptr[col] % d->z; + auto col_ind = d->col_ptr[col] * (2 * d->z); + int16_t *r_ptr = r + d->z * col; + const int16_t *l_ptr = l + d->z * col; + int16_t *llrs_ptr = llrs + col_ind + shift; + + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + Vec_i16 eor_reg = hn::Xor(row_sign, l_reg); + Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); + + // Compute R(n,m) + Mask_i16 pg_tail_eq = hn::Eq(abs_reg, row_min); + Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, row_min2, row_min); + Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); + + // Compute LLR(n) = R(n,m) + L(n,m) + Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); + hn::StoreN(result, di16, llrs_ptr, tail_size); + + // Store R(n,m) for the next iteration + hn::StoreN(r_reg, di16, r_ptr, tail_size); + + // Rearrange LLRs + memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); + // copy (z - shift) elts in the main block to the replicated block + memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, + (d->z - shift) * sizeof(int16_t)); + } +} + +template<> +void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, + const ldpc_layer_data *d, int32_t num_lanes, + int32_t full_vec, uint32_t tail_size, + const int16_t *row_min_array, + const int16_t *row_min2_array, + const int16_t *row_sign_array) { + Mask_i16 pg_tail = hn::FirstN(di16, tail_size); + + // Loop through the columns in the row (variable node n in psi(m)) + for (uint32_t col = 0; col < d->num_cols; ++col) { + auto shift = d->shift_ptr[col] % d->z; + auto col_ind = d->col_ptr[col] * (2 * d->z); + int16_t *llrs_ptr = llrs + col_ind + shift; + const int16_t *l_ptr = l + d->z * col; + int16_t *r_ptr = r + d->z * col; + const int16_t *sign_ptr = row_sign_array; + const int16_t *min_ptr = row_min_array; + const int16_t *min2_ptr = row_min2_array; + + // Loop through the Z rows in the layer (check node m) + for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + Vec_i16 l_reg = hn::LoadU(di16, l_ptr); + Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); + Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); + Mask_i16 pg_neg = hn::Lt(eor_reg, hn::Zero(di16)); + + // Compute R(n,m) + Vec_i16 min_reg = hn::LoadU(di16, min_ptr); + Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); + 
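+      // Min-sum: |R(n,m)| is the smallest |L(n',m)| over the other variable
+      // nodes, so take the second minimum in the lanes where |L(n,m)| equals
+      // the row minimum, and the first minimum otherwise.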
Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + Mask_i16 pg_eq = hn::Eq(abs_reg, min_reg); + Vec_i16 tmp_reg = hn::IfThenElse(pg_eq, min2_reg, min_reg); + Vec_i16 r_reg = hn::IfThenElse(pg_neg, hn::Neg(tmp_reg), tmp_reg); + + // Compute LLR(n) = R(n,m) + L(n,m) + Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); + hn::StoreU(result, di16, llrs_ptr); + + // Store R(n,m) for the next iteration + hn::StoreU(r_reg, di16, r_ptr); + + // Increment pointers + l_ptr += num_lanes; + r_ptr += num_lanes; + llrs_ptr += num_lanes; + sign_ptr += num_lanes; + min_ptr += num_lanes; + min2_ptr += num_lanes; + } + + if (tail_size != 0) { + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); + Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); + Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); + Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); + + // Compute R(n,m) + Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); + Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); + Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); + Mask_i16 pg_tail_eq = hn::Eq(abs_reg, min_reg); + Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, min2_reg, min_reg); + Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); + + // Compute LLR(n) = R(n,m) + L(n,m) + Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); + hn::StoreN(result, di16, llrs_ptr, tail_size); + + // Store R(n,m) for the next iteration + hn::StoreN(r_reg, di16, r_ptr, tail_size); + } + + // Rearrange LLRs + // copy shifted elements in the replicated block + // back to the beginning of the main block + memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); + // copy (z - shift) elts in the main block to the replicated block + memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, + (d->z - shift) * sizeof(int16_t)); + } +} + +template +void __attribute__((flatten)) +run_iterations(uint32_t num_its, uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, int16_t *r, int16_t *l, + int16_t *new_llrs, int32_t num_lanes, int32_t full_vec, + uint32_t tail_size, int16_t *row_min_array, + int16_t *row_min2_array, int16_t *row_sign_array, int16_t *check, + bool check_convergence, + std::optional> &crc_checker) { + for (uint32_t i = 0; i < num_its; ++i) { + ldpc_layer_data d(z, lsi, graph); + auto *r_ptr = r; + + // Loop through the layers (groups of Z rows) + compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, + full_vec, tail_size, row_min_array, + row_min2_array, row_sign_array); + compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, + tail_size, row_min_array, row_min2_array, + row_sign_array); + + for (uint32_t row = 1; row < graph->nrows; ++row) { + d.next(); + r_ptr = r + d.row_start_ind * z; + + // Variable-to-check node messages update + compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, + full_vec, tail_size, row_min_array, + row_min2_array, row_sign_array); + // LLRs update + compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, + tail_size, row_min_array, row_min2_array, + row_sign_array); + } + + // CRC check and early termination + bool crc_passed = crc_checker.has_value() && crc_checker->check(new_llrs); + if (check_convergence && + (crc_passed || parity_check(new_llrs, z, lsi, graph, num_lanes, + full_vec, tail_size, check))) { + break; + } + } +} + +} // anonymous namespace + +template +void armral::ldpc::decode_block(const int8_t 
*llrs, armral_ldpc_graph_t bg, + uint32_t z, uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out, Allocator &allocator) { + // Get the base graph and the lifting size + const auto *graph = armral_ldpc_get_base_graph(bg); + uint32_t lsi = get_lifting_index(z); + + // Only allocate the CRC checker if necessary. + std::optional> maybe_crc_checker; + if (crc_idx != ARMRAL_LDPC_NO_CRC) { + maybe_crc_checker = crc_checker{z, crc_idx, allocator}; + } + + const uint32_t num_llrs = (graph->ncodeword_bits + 2) * z; + + // Assign memory for the things that we need + // We know that the first block rows have the largest number of non-zero + // entries, so the largest layer will be for the first block rows. In + // particular, for both base graphs, the second row is of longest length. + uint32_t mat_size = graph->row_start_inds[graph->nrows] * z; + uint32_t layer_size = + (graph->row_start_inds[2] - graph->row_start_inds[1]) * z; + // We need to keep a record of matrix L (variable-to-check-node messages) + auto l = allocate_uninitialized(allocator, layer_size); + // We need to keep a record of matrix R (check-to-variable-node messages) + auto r = allocate_zeroed(allocator, mat_size); + + auto row_min_array = allocate_zeroed(allocator, z); + auto row_min2_array = allocate_zeroed(allocator, z); + auto row_sign_array = allocate_zeroed(allocator, z); + + auto check = allocate_zeroed(allocator, z); + + // Scalar CAT_TINY tails are less efficient than processing as single, + // partial vector instruction. This is simply disabled currently. + bool z_is_tiny = 0; + + // Keep a record of the current, and previous values of the LLRs + // Copy the inputs LLRs + const auto *llrs_ptr = llrs; + size_t new_llrs_size = num_llrs; + std::optional> maybe_out_llrs; + if (!z_is_tiny) { + // Double the storage required to replicate LLRs for optimization + new_llrs_size *= 2; + // Extra buffer to pack the LLRs again + maybe_out_llrs = allocate_uninitialized(allocator, num_llrs); + } + auto new_llrs = allocate_uninitialized(allocator, new_llrs_size); + + // NOTE: All allocations are now done! + if constexpr (Allocator::is_counting) { + return; + } + + if (z_is_tiny) { + // Set the value of the current LLRs from the ones passed in. + // We need to take account of the punctured columns. + // Also widen to int16_t for use in intermediate calculations. + memset(new_llrs.get(), 0, 2 * z * sizeof(int16_t)); + for (uint32_t i = 0; i < z * graph->ncodeword_bits; i++) { + new_llrs[2 * z + i] = (int16_t)llrs[i]; + } + } else { + // Each block of Z elements replicated b1|b1|b2|b2 ... + // We need to take account of the punctured columns. + // Also widen to int16_t for use in intermediate calculations. 
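+    // Replicating each block lets a cyclic shift of the block be read as one
+    // contiguous unaligned span starting at `shift`, so the layer kernels can
+    // use plain vector loads instead of per-element modulo indexing.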
+ memset(new_llrs.get(), 0, 4 * z * sizeof(int16_t)); + auto *new_llrs_ptr = &new_llrs[4 * z]; + for (uint32_t num_block = 0; num_block < graph->ncodeword_bits; + num_block++) { + for (uint32_t i = 0; i < z; i++) { + new_llrs_ptr[i] = (int16_t)llrs_ptr[i]; + new_llrs_ptr[z + i] = (int16_t)llrs_ptr[i]; + } + new_llrs_ptr += 2 * z; + llrs_ptr += z; + } + } + + // Precompute number of full vector and tail + int32_t num_lanes = hn::Lanes(di16); + int32_t full_vec = z / num_lanes; + uint32_t tail_size = z % num_lanes; + bool is_tail_only = (tail_size == z && !z_is_tiny); + + if (z_is_tiny) { + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); + + // Hard decode into the output variable + llrs_to_bits(num_llrs, new_llrs.get(), data_out); + } else { + if (is_tail_only) { + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); + } else { + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); + } + // Pack LLRs, copy back to original storage + auto *out_llrs = maybe_out_llrs.value().get(); + for (uint32_t num_block = 0; num_block < graph->ncodeword_bits + 2; + num_block++) { + memcpy(out_llrs + num_block * z, &new_llrs[2 * num_block * z], + z * sizeof(int16_t)); + } + + // Hard decode into the output variable + llrs_to_bits(num_llrs, out_llrs, data_out); + } +} + +template void armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, heap_allocator &); + +template void armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &); + +armral_status armral_ldpc_decode_block(const int8_t *llrs, + armral_ldpc_graph_t bg, uint32_t z, + uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out) { + heap_allocator allocator{}; + armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, + allocator); + return ARMRAL_SUCCESS; +} + +armral_status +armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg, + uint32_t z, uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out, void *buffer) { + buffer_bump_allocator allocator{buffer}; + armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, + allocator); + return ARMRAL_SUCCESS; +} + +uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, + uint32_t z, + uint32_t crc_idx, + uint32_t num_its) { + counting_allocator allocator{}; + armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr, + allocator); + return allocator.required_bytes(); +} diff --git a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp new file mode 100644 index 0000000..0af220b --- /dev/null +++ b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp @@ -0,0 +1,813 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference 
P5851 +*/ +#include "../ldpc_coding.hpp" +#include "../ldpc_tables.hpp" +#include "utils/allocators.hpp" +#include "utils/bits_to_bytes.hpp" + +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + +#include +#include +#include + +namespace { +// clang-format on + +inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, + uint8_t *codeword) { + + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 agg0 = hn::LoadU(du8, ptr_agg); + Vec_u8 agg1 = hn::LoadU(du8, ptr_agg + 1); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsm2z = hn::LoadU(du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg1); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg1)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreU(agg0, du8, data_out + 22 * z); + hn::StoreU(result23, du8, data_out + 23 * z); + hn::StoreU(result24, du8, data_out + 24 * z); + hn::StoreU(result25, du8, data_out + 25 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg0 = no_sanitize::MaskedLoad(pg, du8, ptr_agg); + Vec_u8 agg1 = no_sanitize::MaskedLoad(pg, du8, ptr_agg + 1); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsm2z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg1); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg1)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreN(agg0, du8, data_out + 22 * z, tail_size); + hn::StoreN(result23, du8, data_out + 23 * z, tail_size); + hn::StoreN(result24, du8, data_out + 24 * z, tail_size); + hn::StoreN(result25, du8, data_out + 25 * z, tail_size); + } + + // Process the final row + { + codeword[(23 * z) - 1] = agg_parity[z - 1]; + codeword[(24 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(25 * z) - 1] = + parity_hdsm[3 * z - 1] ^ parity_hdsm[4 * z - 1] ^ agg_parity[0]; + codeword[(26 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +} + +inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, + const uint8_t *agg_parity, + uint8_t *codeword) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + if (z == 208) { + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + // zb = 0 to 104 + int32_t full_vectors = 105 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + Vec_u8 agg103 = hn::LoadU(du8, ptr_agg + 103); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsm2z = hn::LoadU(du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg103); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg103)); + Vec_u8 
result25 = hn::Xor(hdsm3z, agg103); + + // Store parity bits + hn::StoreU(agg103, du8, data_out + 22 * z); + hn::StoreU(result23, du8, data_out + 23 * z); + hn::StoreU(result24, du8, data_out + 24 * z); + hn::StoreU(result25, du8, data_out + 25 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = 105 - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg103 = no_sanitize::MaskedLoad(pg, du8, ptr_agg + 103); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsm2z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg103); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg103)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg103); + + // Store parity bits + hn::StoreN(agg103, du8, data_out + 22 * z, tail_size); + hn::StoreN(result23, du8, data_out + 23 * z, tail_size); + hn::StoreN(result24, du8, data_out + 24 * z, tail_size); + hn::StoreN(result25, du8, data_out + 25 * z, tail_size); + + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + // Process zb = 105 to 207 + full_vectors = 103 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + + // Load inputs + Vec_u8 agg105 = hn::LoadU(du8, ptr_agg - 105); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsm2z = hn::LoadU(du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg105); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg105)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg105); + + // Store parity bits + hn::StoreU(agg105, du8, data_out + 22 * z); + hn::StoreU(result23, du8, data_out + 23 * z); + hn::StoreU(result24, du8, data_out + 24 * z); + hn::StoreU(result25, du8, data_out + 25 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + tail_size = 103 - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg105 = no_sanitize::MaskedLoad(pg, du8, ptr_agg - 105); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsm2z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg105); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg105)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg105); + + // Store parity bits + hn::StoreN(agg105, du8, data_out + 22 * z, tail_size); + hn::StoreN(result23, du8, data_out + 23 * z, tail_size); + hn::StoreN(result24, du8, data_out + 24 * z, tail_size); + hn::StoreN(result25, du8, data_out + 25 * z, tail_size); + } + } else { // z != 208 + + // Process the first row of the loop (zb =0) + { + codeword[22 * z] = agg_parity[z - 1]; + codeword[23 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; + codeword[24 * z] = + parity_hdsm[2 * z] ^ parity_hdsm[3 * z] ^ agg_parity[z - 1]; + codeword[25 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + // Process zb = 1 to z + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + 
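+    // For lifting set index 6 the aggregate parity is read with an offset of
+    // one: rows zb = 1..z-1 read agg_parity[zb - 1] below, while row zb = 0
+    // (handled above) reads agg_parity[z - 1].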
+ int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + + // Load inputs + Vec_u8 agg1 = hn::LoadU(du8, ptr_agg - 1); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsm2z = hn::LoadU(du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg1); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg1)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreU(agg1, du8, data_out + 22 * z); + hn::StoreU(result23, du8, data_out + 23 * z); + hn::StoreU(result24, du8, data_out + 24 * z); + hn::StoreU(result25, du8, data_out + 25 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg1 = no_sanitize::MaskedLoad(pg, du8, ptr_agg - 1); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsm2z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 2 * z); + Vec_u8 hdsm3z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result23 = hn::Xor(hdsm0, agg1); + Vec_u8 result24 = hn::Xor(hdsm2z, hn::Xor(hdsm3z, agg1)); + Vec_u8 result25 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreN(agg1, du8, data_out + 22 * z, tail_size); + hn::StoreN(result23, du8, data_out + 23 * z, tail_size); + hn::StoreN(result24, du8, data_out + 24 * z, tail_size); + hn::StoreN(result25, du8, data_out + 25 * z, tail_size); + } + } +} + +inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, + uint8_t *codeword) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + // Process the first row of the loop (zb =0) + { + codeword[10 * z] = agg_parity[z - 1]; + codeword[11 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; + codeword[12 * z] = parity_hdsm[0] ^ parity_hdsm[z] ^ agg_parity[z - 1]; + codeword[13 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + + // Load inputs + Vec_u8 agg1 = hn::LoadU(du8, ptr_agg - 1); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsmz = hn::LoadU(du8, ptr_hdsm + z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result11 = hn::Xor(hdsm0, agg1); + Vec_u8 result12 = hn::Xor(hdsm0, hn::Xor(hdsmz, agg1)); + Vec_u8 result13 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreU(agg1, du8, data_out + 10 * z); + hn::StoreU(result11, du8, data_out + 11 * z); + hn::StoreU(result12, du8, data_out + 12 * z); + hn::StoreU(result13, du8, data_out + 13 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg1 = no_sanitize::MaskedLoad(pg, du8, ptr_agg - 1); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsmz = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + z); + Vec_u8 hdsm3z = 
no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result11 = hn::Xor(hdsm0, agg1); + Vec_u8 result12 = hn::Xor(hdsm0, hn::Xor(hdsmz, agg1)); + Vec_u8 result13 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreN(agg1, du8, data_out + 10 * z, tail_size); + hn::StoreN(result11, du8, data_out + 11 * z, tail_size); + hn::StoreN(result12, du8, data_out + 12 * z, tail_size); + hn::StoreN(result13, du8, data_out + 13 * z, tail_size); + } +} + +inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, + uint8_t *codeword) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + Vec_u8 agg0 = hn::LoadU(du8, ptr_agg); + Vec_u8 agg1 = hn::LoadU(du8, ptr_agg + 1); + Vec_u8 hdsm0 = hn::LoadU(du8, ptr_hdsm); + Vec_u8 hdsmz = hn::LoadU(du8, ptr_hdsm + z); + Vec_u8 hdsm3z = hn::LoadU(du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result11 = hn::Xor(hdsm0, agg1); + Vec_u8 result12 = hn::Xor(hdsm0, hn::Xor(hdsmz, agg1)); + Vec_u8 result13 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreU(agg0, du8, data_out + 10 * z); + hn::StoreU(result11, du8, data_out + 11 * z); + hn::StoreU(result12, du8, data_out + 12 * z); + hn::StoreU(result13, du8, data_out + 13 * z); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 agg0 = no_sanitize::MaskedLoad(pg, du8, ptr_agg); + Vec_u8 agg1 = no_sanitize::MaskedLoad(pg, du8, ptr_agg + 1); + Vec_u8 hdsm0 = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm); + Vec_u8 hdsmz = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + z); + Vec_u8 hdsm3z = no_sanitize::MaskedLoad(pg, du8, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + Vec_u8 result11 = hn::Xor(hdsm0, agg1); + Vec_u8 result12 = hn::Xor(hdsm0, hn::Xor(hdsmz, agg1)); + Vec_u8 result13 = hn::Xor(hdsm3z, agg1); + + // Store parity bits + hn::StoreN(agg0, du8, data_out + 10 * z, tail_size); + hn::StoreN(result11, du8, data_out + 11 * z, tail_size); + hn::StoreN(result12, du8, data_out + 12 * z, tail_size); + hn::StoreN(result13, du8, data_out + 13 * z, tail_size); + } + + // Process the final row outside of the loop + { + codeword[(11 * z) - 1] = agg_parity[z - 1]; + codeword[(12 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(13 * z) - 1] = + parity_hdsm[z - 1] ^ parity_hdsm[2 * z - 1] ^ agg_parity[0]; + codeword[(14 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +} + +// Set parity for base graph 1 +inline void set_parity_hdsm_bg1(uint32_t z, uint32_t lsi, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, uint8_t *codeword) { + if (lsi == 6) { + set_parity_hdsm_bg1_lsi_6(z, parity_hdsm, agg_parity, codeword); + } else { + set_parity_hdsm_bg1_lsi_not_6(z, parity_hdsm, agg_parity, codeword); + } +} + +// Set parity for base graph 2 +inline void set_parity_hdsm_bg2(uint32_t z, uint32_t lsi, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, uint8_t *codeword) { + if ((lsi == 3) || (lsi == 7)) { + 
set_parity_hdsm_bg2_lsi_3_or_7(z, parity_hdsm, agg_parity, codeword); + } else { + set_parity_hdsm_bg2_lsi_not_3_nor_7(z, parity_hdsm, agg_parity, codeword); + } +} + +// For a given base graph, and some intermediate results, performs the +// multiplication by the inverse of the matrix required to get the parity bits +// from the high-density sub matrix. We pass in the parity of the message bits +// multiplied by part of the high-density sub matrix in parity_hdsm. The +// array agg_parity is the aggregated parity of bits in parity_hdsm across +// blocks of size z. This pattern is used a lot in the inversion for all of the +// encodings. The codeword contains the message bits at the start, and after +// these we set the parity bits in the 4 * z positions after the message bits. +inline void calc_hdsm_parity(uint32_t z, uint32_t lsi, armral_ldpc_graph_t bg, + const armral_ldpc_base_graph_t *graph, + const uint8_t *parity_hdsm, + const uint8_t *agg_parity, uint8_t *codeword) { + switch (bg) { + case LDPC_BASE_GRAPH_1: + set_parity_hdsm_bg1(z, lsi, parity_hdsm, agg_parity, codeword); + break; + case LDPC_BASE_GRAPH_2: + set_parity_hdsm_bg2(z, lsi, parity_hdsm, agg_parity, codeword); + break; + } +} + +// Dimensions of base graph 1 +constexpr uint32_t bg1_num_rows = 46; +constexpr uint32_t bg1_message_bits = 22; +constexpr uint32_t bg1_codeword_bits = 66; + +// Dimensions of base graph 2 +constexpr uint32_t bg2_num_rows = 42; +constexpr uint32_t bg2_message_bits = 10; +constexpr uint32_t bg2_codeword_bits = 50; + +const armral_ldpc_base_graph_t base_graph_1{bg1_num_rows, bg1_message_bits, + bg1_codeword_bits, bg1_row_start, + bg1_columns, bg1_shifts}; + +const armral_ldpc_base_graph_t base_graph_2{bg2_num_rows, bg2_message_bits, + bg2_codeword_bits, bg2_row_start, + bg2_columns, bg2_shifts}; + +inline void calc_extension_parity(uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, + uint8_t *codeword) { + auto max_ind = graph->nmessage_bits + 4; + + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + for (uint32_t i = 4; i < graph->nrows; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then the lifting set index + // is added to this + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < max_ind; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = codeword + z * (graph->nmessage_bits + i); + auto *codeword_ptr = codeword + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg1 = hn::LoadU(du8, out_ptr); + Vec_u8 reg2 = hn::LoadU(du8, codeword_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if 
(tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 reg1 = no_sanitize::MaskedLoad(pg, du8, out_ptr); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, codeword_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + codeword_ptr = codeword + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg1 = hn::LoadU(du8, out_ptr); + Vec_u8 reg2 = hn::LoadU(du8, codeword_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 reg1 = no_sanitize::MaskedLoad(pg, du8, out_ptr); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, codeword_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + } + } + // We should have used every column apart from the last one + assert(j == col_entries - 1); + } +} + +inline void spmv_hdsm(uint32_t z, uint32_t lsi, + const armral_ldpc_base_graph_t *graph, uint8_t *bytes_in, + uint8_t *parity_hdsm) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + + for (uint32_t i = 0; i < 4; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < graph->nmessage_bits; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = parity_hdsm + z * i; + auto *in_ptr = bytes_in + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg1 = hn::LoadU(du8, out_ptr); + Vec_u8 reg2 = hn::LoadU(du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 reg1 = no_sanitize::MaskedLoad(pg, du8, out_ptr); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + in_ptr = bytes_in + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg1 = hn::LoadU(du8, out_ptr); + Vec_u8 reg2 = hn::LoadU(du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, 
tail_size); + Vec_u8 reg1 = no_sanitize::MaskedLoad(pg, du8, out_ptr); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + } + } + // We should have used every column apart from the last one + assert(j < col_entries && col_ptr[j] >= graph->nmessage_bits); + } +} + +inline void copy_input_message(uint32_t z, + const armral_ldpc_base_graph_t *graph, + const uint8_t *bytes_in, uint8_t *codeword) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + int32_t full_vectors = z / num_lanes; + int32_t tail_size = z - (full_vectors * num_lanes); + + for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { + uint8_t *out_ptr = codeword + j * z; + const uint8_t *in_ptr = bytes_in + j * z; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg = hn::LoadU(du8, in_ptr); + hn::StoreU(reg, du8, out_ptr); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 reg = no_sanitize::MaskedLoad(pg, du8, in_ptr); + hn::StoreN(reg, du8, out_ptr, tail_size); + } + } +} + +inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, + uint8_t *tmp_parity) { + using Mask_u8 = hn::Mask; + + int32_t num_lanes = hn::Lanes(du8); + int32_t full_vectors = z / num_lanes; + + // First iteration, tmp_parity is vector of 0 + uint8_t *out_ptr = tmp_parity; + const uint8_t *in_ptr = parity_hdsm; + Vec_u8 reg1 = hn::Set(du8, 0); + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + Vec_u8 reg2 = hn::LoadU(du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = z - (full_vectors * num_lanes); + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + } + + // Iteration 1 to 3 + for (uint32_t j = 1; j < 4; ++j) { + out_ptr = tmp_parity; + in_ptr = parity_hdsm + z * j; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + reg1 = hn::LoadU(du8, out_ptr); + Vec_u8 reg2 = hn::LoadU(du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreU(result, du8, out_ptr); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + Mask_u8 pg = hn::FirstN(du8, tail_size); + reg1 = no_sanitize::MaskedLoad(pg, du8, out_ptr); + Vec_u8 reg2 = no_sanitize::MaskedLoad(pg, du8, in_ptr); + Vec_u8 result = hn::Xor(reg1, reg2); + hn::StoreN(result, du8, out_ptr, tail_size); + } + } +} + +} // anonymous namespace + +namespace armral::ldpc { + +uint32_t get_lifting_index(uint32_t lifting_size) { + // Each lifting size is either a power of two, + // or an odd multiple (up to 15) of a power of two. Find the first odd + // number when shifting right, + // e.g. 
(112 -> 56 -> 28 -> 14 -> 7) + // then divide that by two to get the index from + // the mapping: + // 2 -> 0 + // 3 -> 1 + // 5 -> 2 + // 7 -> 3 + // 9 -> 4 + // 11 -> 5 + // 13 -> 6 + // 15 -> 7 + // Using the example above, 112 would then be mapped onto index set 3 + assert(lifting_size > 0); + auto lifting_set_index = lifting_size >> __builtin_ctz(lifting_size); + assert(lifting_set_index <= 15); + lifting_set_index >>= 1; + return lifting_set_index; +} + +template +armral_status encode_block(const uint8_t *data_in, armral_ldpc_graph_t bg, + uint32_t z, uint32_t len_filler_bits, + uint8_t *data_out, Allocator &allocator) { + + // Get a pointer to the graph to be working with + const auto *graph = armral_ldpc_get_base_graph(bg); + assert(graph); + + auto bytes_in = allocate_zeroed(allocator, z * graph->nmessage_bits); + auto parity_hdsm = allocate_zeroed(allocator, 4 * z); + auto codeword = + allocate_zeroed(allocator, (graph->ncodeword_bits + 2) * z); + auto tmp_parity = allocate_zeroed(allocator, z); + + if constexpr (Allocator::is_counting) { + return ARMRAL_SUCCESS; + } + + // Cast the bits to bytes for easier handling of data, + // ignore filler bits if present + bits_to_bytes(z * graph->nmessage_bits - len_filler_bits, data_in, + bytes_in.get()); + + // Get the lifting set index + auto lsi = get_lifting_index(z); + + // The encoding is done by computing: + // 1- Parity bits for the high-density submatrix (hdsm) + // 2- Parity bits for the extension matrix + + // 1- Encoding of the high-density submatrix hdsm + // Multiply input message bit by parity-check matrix H + spmv_hdsm(z, lsi, graph, bytes_in.get(), parity_hdsm.get()); + + // Copy input message bits to the codeword (output) + copy_input_message(z, graph, bytes_in.get(), codeword.get()); + + // Build the right-hand side of the linear systems + // to solve for hdsm parity computation + calc_hdsm_rhs(z, parity_hdsm.get(), tmp_parity.get()); + + // Finally, computation of hdsm parity bits + calc_hdsm_parity(z, lsi, bg, graph, parity_hdsm.get(), tmp_parity.get(), + codeword.get()); + + // // 2- Parity bits for the extension matrix. + // // It involves the sparse matrix vector multiplication on the remaining + // // rows of the matrix and writing into the codeword at the same row + // // as we are currently working on + calc_extension_parity(z, lsi, graph, codeword.get()); + + // // Now convert the bytes back to bits. + // // Puncturing is performed by removing 2*z elts + // // From the codeword. It is done very late because + // // the first two columns of the input message are + // // involved in all the parity bit computations. 
+ bytes_to_bits(z * graph->ncodeword_bits, &codeword[2 * z], data_out); + + return ARMRAL_SUCCESS; +} + +} // namespace armral::ldpc + +armral_status armral_ldpc_encode_block(const uint8_t *data_in, + armral_ldpc_graph_t bg, uint32_t z, + uint32_t len_filler_bits, + uint8_t *data_out) { + heap_allocator allocator{}; + return armral::ldpc::encode_block(data_in, bg, z, len_filler_bits, data_out, + allocator); +} + +armral_status +armral_ldpc_encode_block_noalloc(const uint8_t *data_in, armral_ldpc_graph_t bg, + uint32_t z, uint32_t len_filler_bits, + uint8_t *data_out, void *buffer) { + buffer_bump_allocator allocator{buffer}; + return armral::ldpc::encode_block(data_in, bg, z, len_filler_bits, data_out, + allocator); +} + +uint32_t +armral_ldpc_encode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, uint32_t z, + uint32_t len_filler_bits) { + counting_allocator allocator{}; + (void)armral::ldpc::encode_block(nullptr, bg, z, len_filler_bits, nullptr, + allocator); + return allocator.required_bytes(); +} + +const armral_ldpc_base_graph_t * +armral_ldpc_get_base_graph(armral_ldpc_graph_t bg) { + return bg == LDPC_BASE_GRAPH_1 ? &base_graph_1 : &base_graph_2; +} diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index 5e97e35..ff205cf 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -4,6 +4,7 @@ */ #include "armral.h" #include "ldpc_coding.hpp" +#include "ldpc_tables.hpp" #include "utils/allocators.hpp" #include "utils/bits_to_bytes.hpp" @@ -16,934 +17,6 @@ #include namespace { -// The base graphs are given in compressed sparse row format. We need three -// arrays for this. Firstly we have the row start indices, which stores the -// indices into another array which indicates where the row starts. The next -// array stores the indices of the columns which are non-zero in a row. Finally, -// we have an array of values corresponding to the non-zero entries in the -// matrix. -// For example, `bg1_row_start[3]` is the index into `bg1_columns` for the -// start of the fourth row, and `bg1_columns[bg1_row_starts[3]]` is the index of -// a column in the fourth row of the matrix which contains a non-zero value. - -// Base graph 1 is taken from 3GPP standard document 38.212 table 5.3.2-2. 
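[Editor's note, not part of the patch] The three encoder entry points added above (armral_ldpc_encode_block, armral_ldpc_encode_block_noalloc and armral_ldpc_encode_block_noalloc_buffer_size) support either heap allocation or a caller-supplied workspace. The sketch below is illustrative only: encode_one_block is a hypothetical caller, the lifting size 384 and zero filler bits are example values, and data_in/data_out are assumed to be sized by the caller for the chosen base graph.

// Illustration only: query the required workspace, then call the
// no-allocation variant with a caller-owned buffer.
#include "armral.h"
#include <cstdint>
#include <vector>

void encode_one_block(const uint8_t *data_in, uint8_t *data_out) {
  const armral_ldpc_graph_t bg = LDPC_BASE_GRAPH_1;
  const uint32_t z = 384;             // example lifting size (hypothetical)
  const uint32_t len_filler_bits = 0; // no filler bits in this sketch

  // Ask the library how many scratch bytes the noalloc variant needs.
  uint32_t buf_size =
      armral_ldpc_encode_block_noalloc_buffer_size(bg, z, len_filler_bits);
  std::vector<uint8_t> buffer(buf_size);

  // Encode using the caller-provided workspace instead of the heap.
  armral_status status = armral_ldpc_encode_block_noalloc(
      data_in, bg, z, len_filler_bits, data_out, buffer.data());
  (void)status;
}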
- -// The row start indices for base graph 1 -const uint32_t bg1_row_start[] = { - 0, 19, 38, 57, 76, 79, 87, 96, 103, 113, 122, 129, - 137, 144, 150, 157, 164, 170, 176, 182, 188, 194, 200, 205, - 210, 216, 221, 226, 230, 235, 240, 245, 250, 255, 260, 265, - 270, 275, 279, 284, 289, 293, 298, 302, 307, 312, 316}; - -// clang-format off -const uint32_t bg1_columns[] = { - 0, 1, 2, 3, 5, 6, 9, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 23, // row 0: 19 - 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, 24, // row 1: 19 - 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 18, 19, 20, 24, 25, // row 2: 19 - 0, 1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 25, // row 3: 19 - 0, 1, 26, // row 4: 3 - 0, 1, 3, 12, 16, 21, 22, 27, // row 5: 8 - 0, 6, 10, 11, 13, 17, 18, 20, 28, // row 6: 9 - 0, 1, 4, 7, 8, 14, 29, // row 7: 7 - 0, 1, 3, 12, 16, 19, 21, 22, 24, 30, // row 8: 10 - 0, 1, 10, 11, 13, 17, 18, 20, 31, // row 9: 9 - 1, 2, 4, 7, 8, 14, 32, // row 10: 7 - 0, 1, 12, 16, 21, 22, 23, 33, // row 11: 8 - 0, 1, 10, 11, 13, 18, 34, // row 12: 7 - 0, 3, 7, 20, 23, 35, // row 13: 6 - 0, 12, 15, 16, 17, 21, 36, // row 14: 7 - 0, 1, 10, 13, 18, 25, 37, // row 15: 7 - 1, 3, 11, 20, 22, 38, // row 16: 6 - 0, 14, 16, 17, 21, 39, // row 17: 6 - 1, 12, 13, 18, 19, 40, // row 18: 6 - 0, 1, 7, 8, 10, 41, // row 19: 6 - 0, 3, 9, 11, 22, 42, // row 20: 6 - 1, 5, 16, 20, 21, 43, // row 21: 6 - 0, 12, 13, 17, 44, // row 22: 5 - 1, 2, 10, 18, 45, // row 23: 5 - 0, 3, 4, 11, 22, 46, // row 24: 6 - 1, 6, 7, 14, 47, // row 25: 5 - 0, 2, 4, 15, 48, // row 26: 5 - 1, 6, 8, 49, // row 27: 4 - 0, 4, 19, 21, 50, // row 28: 5 - 1, 14, 18, 25, 51, // row 29: 5 - 0, 10, 13, 24, 52, // row 30: 5 - 1, 7, 22, 25, 53, // row 31: 5 - 0, 12, 14, 24, 54, // row 32: 5 - 1, 2, 11, 21, 55, // row 33: 5 - 0, 7, 15, 17, 56, // row 34: 5 - 1, 6, 12, 22, 57, // row 35: 5 - 0, 14, 15, 18, 58, // row 36: 5 - 1, 13, 23, 59, // row 37: 4 - 0, 9, 10, 12, 60, // row 38: 5 - 1, 3, 7, 19, 61, // row 39: 5 - 0, 8, 17, 62, // row 40: 4 - 1, 3, 9, 18, 63, // row 41: 5 - 0, 4, 24, 64, // row 42: 4 - 1, 16, 18, 25, 65, // row 43: 5 - 0, 7, 9, 22, 66, // row 44: 5 - 1, 6, 10, 67 // row 45: 4 -}; - -// The shifts are organized by row, and then by index set. Each line in the -// following represents the shifts in one index set for one block row of the -// matrix. Indexing into the array works as follows. 
If we are using index set k -// for k in [0, 7], and are on block row i, then the indexing function from k, i -// to j is ind(k, i) = 8 * bg1_row_start[i] + (bg1_row_start[i+1] - -// bg1_row_start[i]) * k -const uint32_t bg1_shifts[] = { - 250, 69, 226, 159, 100, 10, 59, 229, 110, 191, 9, 195, 23, 190, 35, 239, 31, 1, 0, // row 0 - 307, 19, 50, 369, 181, 216, 317, 288, 109, 17, 357, 215, 106, 242, 180, 330, 346, 1, 0, - 73, 15, 103, 49, 240, 39, 15, 162, 215, 164, 133, 298, 110, 113, 16, 189, 32, 1, 0, - 223, 16, 94, 91, 74, 10, 0, 205, 216, 21, 215, 14, 70, 141, 198, 104, 81, 1, 0, - 211, 198, 188, 186, 219, 4, 29, 144, 116, 216, 115, 233, 144, 95, 216, 73, 261, 1, 0, - 294, 118, 167, 330, 207, 165, 243, 250, 1, 339, 201, 53, 347, 304, 167, 47, 188, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 135, 227, 126, 134, 84, 83, 53, 225, 205, 128, 75, 135, 217, 220, 90, 105, 137, 1, 0, - - 2, 239, 117, 124, 71, 222, 104, 173, 220, 102, 109, 132, 142, 155, 255, 28, 0, 0, 0, // row 1 - 76, 76, 73, 288, 144, 331, 331, 178, 295, 342, 217, 99, 354, 114, 331, 112, 0, 0, 0, - 303, 294, 27, 261, 161, 133, 4, 80, 129, 300, 76, 266, 72, 83, 260, 301, 0, 0, 0, - 141, 45, 151, 46, 119, 157, 133, 87, 206, 93, 79, 9, 118, 194, 31, 187, 0, 0, 0, - 179, 162, 223, 256, 160, 76, 202, 117, 109, 15, 72, 152, 158, 147, 156, 119, 0, 0, 0, - 77, 225, 96, 338, 268, 112, 302, 50, 167, 253, 334, 242, 257, 133, 9, 302, 0, 0, 0, - 22, 11, 124, 0, 10, 0, 0, 2, 16, 60, 0, 6, 30, 0, 168, 31, 105, 0, 0, - 96, 236, 136, 221, 128, 92, 172, 56, 11, 189, 95, 85, 153, 87, 163, 216, 0, 0, 0, - - 106, 111, 185, 63, 117, 93, 229, 177, 95, 39, 142, 225, 225, 245, 205, 251, 117, 0, 0, // row 2 - 205, 250, 328, 332, 256, 161, 267, 160, 63, 129, 200, 88, 53, 131, 240, 205, 13, 0, 0, - 68, 7, 80, 280, 38, 227, 202, 200, 71, 106, 295, 283, 301, 184, 246, 230, 276, 0, 0, - 207, 203, 31, 176, 180, 186, 95, 153, 177, 70, 77, 214, 77, 198, 117, 223, 90, 0, 0, - 258, 167, 220, 133, 243, 202, 218, 63, 0, 3, 74, 229, 0, 216, 269, 200, 234, 0, 0, - 226, 35, 213, 302, 111, 265, 128, 237, 294, 127, 110, 286, 125, 131, 163, 210, 7, 0, 0, - 132, 37, 21, 180, 4, 149, 48, 38, 122, 195, 155, 28, 85, 47, 179, 42, 66, 0, 0, - 189, 4, 225, 151, 236, 117, 179, 92, 24, 68, 6, 101, 33, 96, 125, 67, 230, 0, 0, - - 121, 89, 84, 20, 150, 131, 243, 136, 86, 246, 219, 211, 240, 76, 244, 144, 12, 1, 0, // row 3 - 276, 87, 0, 275, 199, 153, 56, 132, 305, 231, 341, 212, 304, 300, 271, 39, 357, 1, 0, - 220, 208, 30, 197, 61, 175, 79, 281, 303, 253, 164, 53, 44, 28, 77, 319, 68, 1, 0, - 201, 18, 165, 5, 45, 142, 16, 34, 155, 213, 147, 69, 96, 74, 99, 30, 158, 1, 0, - 187, 145, 166, 108, 82, 132, 197, 41, 162, 57, 36, 115, 242, 165, 0, 113, 108, 1, 0, - 97, 94, 49, 279, 139, 166, 91, 106, 246, 345, 269, 185, 249, 215, 143, 121, 121, 1, 0, - 4, 6, 33, 113, 49, 21, 6, 151, 83, 154, 87, 5, 92, 173, 120, 2, 142, 0, 0, - 128, 23, 162, 220, 43, 186, 96, 1, 216, 22, 24, 167, 200, 32, 235, 172, 219, 1, 0, - - 157, 102, 0, // row 4 - 332, 181, 0, - 233, 205, 0, - 170, 10, 0, - 246, 235, 0, - 42, 256, 0, - 24, 204, 0, - 64, 211, 0, - - 205, 236, 194, 231, 28, 123, 115, 0, // row 5 - 195, 14, 115, 166, 241, 51, 157, 0, - 83, 292, 50, 318, 201, 267, 279, 0, - 164, 59, 86, 80, 182, 130, 153, 0, - 261, 181, 72, 283, 254, 79, 144, 0, - 219, 130, 251, 322, 295, 258, 283, 0, - 185, 100, 24, 65, 207, 161, 72, 0, - 2, 171, 47, 143, 210, 180, 180, 0, - - 183, 22, 28, 67, 244, 11, 157, 211, 0, // row 6 - 278, 257, 1, 351, 92, 253, 18, 225, 0, - 289, 21, 293, 13, 232, 302, 
138, 235, 0, - 158, 119, 113, 21, 63, 51, 136, 116, 0, - 80, 144, 169, 90, 59, 177, 151, 108, 0, - 294, 73, 330, 99, 172, 150, 284, 305, 0, - 6, 27, 163, 50, 48, 24, 38, 91, 0, - 199, 22, 23, 100, 92, 207, 52, 13, 0, - - 220, 44, 159, 31, 167, 104, 0, // row 7 - 9, 62, 316, 333, 290, 114, 0, - 12, 88, 207, 50, 25, 76, 0, - 17, 76, 104, 100, 150, 158, 0, - 169, 189, 154, 184, 104, 164, 0, - 3, 103, 224, 297, 215, 39, 0, - 145, 88, 112, 153, 159, 76, 0, - 77, 146, 209, 32, 166, 18, 0, - - 112, 4, 7, 211, 102, 164, 109, 241, 90, 0, // row 8 - 307, 179, 165, 18, 39, 224, 368, 67, 170, 0, - 295, 133, 130, 231, 296, 110, 269, 245, 154, 0, - 33, 95, 4, 217, 204, 39, 58, 44, 201, 0, - 54, 0, 252, 41, 98, 46, 15, 230, 54, 0, - 348, 75, 22, 312, 224, 17, 59, 314, 244, 0, - 172, 2, 131, 141, 96, 99, 101, 35, 116, 0, - 181, 105, 141, 223, 177, 145, 199, 153, 38, 0, - - 103, 182, 109, 21, 142, 14, 61, 216, 0, // row 9 - 366, 232, 321, 133, 57, 303, 63, 82, 0, - 189, 244, 36, 286, 151, 267, 135, 209, 0, - 9, 37, 213, 105, 89, 185, 109, 218, 0, - 162, 159, 93, 134, 45, 132, 76, 209, 0, - 156, 88, 293, 111, 92, 152, 23, 337, 0, - 6, 10, 145, 53, 201, 4, 164, 173, 0, - 169, 12, 206, 221, 17, 212, 92, 205, 0, - - 98, 149, 167, 160, 49, 58, 0, // row 10 - 101, 339, 274, 111, 383, 354, 0, - 14, 80, 211, 75, 161, 311, 0, - 82, 165, 174, 19, 194, 103, 0, - 178, 1, 28, 267, 234, 201, 0, - 175, 253, 27, 231, 49, 267, 0, - 126, 77, 156, 16, 12, 70, 0, - 116, 151, 70, 230, 115, 84, 0, - - 77, 41, 83, 182, 78, 252, 22, 0, // row 11 - 48, 102, 8, 47, 188, 334, 115, 0, - 16, 147, 290, 289, 177, 43, 280, 0, - 52, 11, 2, 35, 32, 84, 201, 0, - 55, 23, 274, 181, 273, 39, 26, 0, - 25, 322, 200, 351, 166, 338, 192, 0, - 184, 194, 123, 16, 104, 109, 124, 0, - 45, 115, 134, 1, 152, 165, 107, 0, - - 160, 42, 21, 32, 234, 7, 0, // row 12 - 77, 186, 174, 232, 50, 74, 0, - 229, 235, 169, 48, 105, 52, 0, - 142, 175, 136, 3, 28, 182, 0, - 225, 162, 244, 151, 238, 243, 0, - 123, 217, 142, 110, 176, 76, 0, - 6, 20, 203, 153, 104, 207, 0, - 186, 215, 124, 180, 98, 80, 0, - - 177, 248, 151, 185, 62, 0, // row 13 - 313, 177, 266, 115, 370, 0, - 39, 302, 303, 160, 37, 0, - 81, 56, 72, 217, 78, 0, - 231, 0, 216, 47, 36, 0, - 311, 251, 265, 94, 81, 0, - 52, 147, 1, 16, 46, 0, - 220, 185, 154, 178, 150, 0, - - 206, 55, 206, 127, 16, 229, 0, // row 14 - 142, 248, 137, 89, 347, 12, 0, - 78, 299, 54, 61, 179, 258, 0, - 14, 175, 211, 191, 51, 43, 0, - 0, 186, 253, 16, 0, 79, 0, - 22, 322, 277, 156, 66, 78, 0, - 1, 202, 118, 130, 1, 2, 0, - 124, 144, 182, 95, 72, 76, 0, - - 40, 96, 65, 63, 75, 179, 0, // row 15 - 241, 2, 210, 318, 55, 269, 0, - 229, 290, 60, 130, 184, 51, 0, - 90, 120, 131, 209, 209, 81, 0, - 170, 0, 183, 108, 68, 64, 0, - 176, 348, 15, 81, 176, 113, 0, - 173, 6, 81, 182, 53, 46, 0, - 39, 138, 220, 173, 142, 49, 0, - - 64, 49, 49, 51, 154, 0, // row 16 - 13, 338, 57, 289, 57, 0, - 69, 140, 45, 115, 300, 0, - 154, 164, 43, 189, 101, 0, - 270, 13, 99, 54, 0, 0, - 190, 293, 332, 331, 114, 0, - 88, 198, 160, 122, 182, 0, - 78, 152, 84, 5, 205, 0, - - 7, 164, 59, 1, 144, 0, // row 17 - 260, 303, 81, 358, 375, 0, - 257, 147, 128, 51, 228, 0, - 56, 110, 200, 63, 4, 0, - 153, 137, 0, 0, 162, 0, - 110, 228, 247, 116, 190, 0, - 91, 184, 30, 3, 155, 0, - 183, 112, 106, 219, 129, 0, - - 42, 233, 8, 155, 147, 0, // row 18 - 130, 163, 280, 132, 4, 0, - 260, 294, 291, 141, 295, 0, - 199, 110, 200, 143, 186, 0, - 161, 151, 0, 241, 144, 0, - 47, 286, 246, 181, 73, 0, - 1, 41, 167, 68, 148, 0, - 183, 215, 180, 143, 14, 0, - - 60, 73, 72, 127, 
224, 0, // row 19 - 145, 213, 344, 242, 197, 0, - 64, 181, 101, 270, 41, 0, - 8, 6, 103, 198, 8, 0, - 0, 0, 118, 144, 0, 0, - 87, 110, 147, 258, 204, 0, - 12, 6, 166, 184, 191, 0, - 179, 108, 159, 138, 196, 0, - - 151, 186, 217, 47, 160, 0, // row 20 - 187, 206, 264, 341, 59, 0, - 301, 162, 40, 130, 10, 0, - 105, 210, 121, 214, 183, 0, - 265, 81, 90, 144, 228, 0, - 89, 65, 155, 244, 30, 0, - 6, 12, 15, 5, 30, 0, - 77, 187, 203, 167, 130, 0, - - 249, 121, 109, 131, 171, 0, // row 21 - 205, 102, 328, 213, 97, 0, - 79, 175, 132, 283, 103, 0, - 192, 131, 220, 50, 106, 0, - 64, 46, 266, 9, 18, 0, - 162, 264, 346, 143, 109, 0, - 6, 86, 96, 42, 199, 0, - 197, 122, 215, 65, 216, 0, - - 64, 142, 188, 158, 0, // row 22 - 30, 11, 233, 22, 0, - 177, 20, 55, 316, 0, - 53, 0, 3, 148, 0, - 72, 189, 72, 257, 0, - 280, 157, 236, 113, 0, - 44, 58, 130, 131, 0, - 25, 47, 126, 178, 0, - - 156, 147, 170, 152, 0, // row 23 - 24, 89, 61, 27, 0, - 249, 50, 133, 105, 0, - 88, 203, 168, 122, 0, - 180, 0, 0, 165, 0, - 18, 6, 181, 304, 0, - 45, 18, 132, 100, 0, - 185, 127, 117, 199, 0, - - 112, 86, 236, 116, 222, 0, // row 24 - 298, 158, 235, 339, 234, 0, - 289, 280, 110, 187, 281, 0, - 49, 157, 64, 193, 124, 0, - 236, 199, 0, 266, 0, 0, - 38, 170, 249, 288, 194, 0, - 9, 125, 191, 28, 6, 0, - 32, 178, 2, 156, 58, 0, - - 23, 136, 116, 182, 0, // row 25 - 72, 17, 383, 312, 0, - 172, 295, 96, 46, 0, - 1, 166, 65, 81, 0, - 205, 0, 0, 183, 0, - 279, 255, 111, 54, 0, - 4, 74, 16, 28, 0, - 27, 141, 11, 181, 0, - - 195, 243, 215, 61, 0, // row 26 - 71, 81, 76, 136, 0, - 270, 110, 318, 67, 0, - 107, 176, 212, 127, 0, - 0, 0, 0, 277, 0, - 325, 326, 226, 99, 0, - 21, 142, 192, 197, 0, - 163, 131, 169, 98, 0, - - 25, 104, 194, 0, // row 27 - 194, 194, 101, 0, - 210, 29, 304, 0, - 208, 141, 174, 0, - 45, 36, 72, 0, - 91, 326, 268, 0, - 98, 140, 22, 0, - 165, 232, 9, 0, - - 128, 165, 181, 63, 0, // row 28 - 222, 19, 244, 274, 0, - 11, 293, 50, 234, 0, - 146, 153, 217, 114, 0, - 275, 0, 155, 62, 0, - 102, 1, 40, 167, 0, - 4, 1, 40, 93, 0, - 32, 43, 200, 205, 0, - - 86, 236, 84, 6, 0, // row 29 - 252, 5, 147, 78, 0, - 27, 308, 117, 29, 0, - 150, 11, 53, 68, 0, - 0, 180, 0, 42, 0, - 273, 104, 243, 107, 0, - 92, 136, 106, 6, 0, - 232, 32, 118, 103, 0, - - 216, 73, 120, 9, 0, // row 30 - 159, 229, 260, 90, 0, - 91, 23, 105, 135, 0, - 34, 130, 210, 123, 0, - 0, 90, 252, 173, 0, - 171, 16, 95, 212, 0, - 2, 88, 112, 20, 0, - 170, 199, 26, 105, 0, - - 95, 177, 172, 61, 0, // row 31 - 100, 215, 258, 256, 0, - 222, 308, 66, 162, 0, - 175, 49, 177, 128, 0, - 144, 144, 166, 19, 0, - 101, 297, 279, 222, 0, - 4, 49, 125, 194, 0, - 73, 149, 175, 108, 0, - - 221, 112, 199, 121, 0, // row 32 - 102, 201, 175, 287, 0, - 210, 22, 271, 217, 0, - 192, 209, 58, 30, 0, - 0, 211, 36, 162, 0, - 351, 265, 338, 83, 0, - 6, 126, 63, 20, 0, - 103, 110, 151, 211, 0, - - 2, 187, 41, 211, 0, // row 33 - 323, 8, 361, 105, 0, - 170, 20, 140, 33, 0, - 114, 49, 161, 137, 0, - 0, 0, 76, 18, 0, - 56, 304, 141, 101, 0, - 10, 30, 6, 92, 0, - 199, 132, 172, 65, 0, - - 127, 167, 164, 159, 0, // row 34 - 230, 148, 202, 312, 0, - 187, 296, 5, 44, 0, - 82, 186, 68, 150, 0, - 197, 0, 108, 0, 0, - 60, 320, 112, 54, 0, - 4, 153, 197, 155, 0, - 161, 237, 142, 180, 0, - - 161, 197, 207, 103, 0, // row 35 - 320, 335, 2, 266, 0, - 207, 158, 55, 285, 0, - 192, 173, 26, 187, 0, - 199, 278, 0, 205, 0, - 100, 210, 195, 268, 0, - 4, 45, 168, 185, 0, - 231, 174, 145, 100, 0, - - 37, 105, 51, 120, 0, // row 36 - 210, 313, 297, 21, 0, - 259, 179, 178, 160, 0, - 222, 157, 0, 6, 0, - 216, 16, 
0, 0, 0, - 135, 15, 35, 188, 0, - 6, 200, 177, 43, 0, - 11, 207, 42, 100, 0, - - 198, 220, 122, 0, // row 37 - 269, 82, 115, 0, - 298, 15, 115, 0, - 81, 195, 138, 0, - 72, 144, 0, 0, - 319, 236, 85, 0, - 82, 2, 135, 0, - 59, 204, 161, 0, - - 167, 151, 157, 163, 0, // row 38 - 185, 177, 289, 214, 0, - 151, 179, 64, 181, 0, - 123, 90, 73, 10, 0, - 190, 0, 0, 0, 0, - 164, 196, 209, 246, 0, - 91, 64, 198, 100, 0, - 121, 90, 26, 140, 0, - - 173, 139, 149, 0, 0, // row 39 - 258, 93, 346, 297, 0, - 102, 77, 192, 208, 0, - 12, 77, 49, 114, 0, - 153, 0, 165, 117, 0, - 236, 264, 37, 272, 0, - 4, 28, 109, 188, 0, - 115, 188, 168, 52, 0, - - 157, 137, 149, 0, // row 40 - 175, 37, 312, 0, - 32, 80, 197, 0, - 67, 45, 96, 0, - 216, 144, 2, 0, - 304, 237, 135, 0, - 10, 84, 12, 0, - 4, 103, 30, 0, - - 167, 173, 139, 151, 0, // row 41 - 52, 314, 139, 288, 0, - 154, 47, 124, 207, 0, - 23, 215, 60, 167, 0, - 0, 0, 0, 183, 0, - 123, 77, 25, 272, 0, - 2, 75, 142, 128, 0, - 53, 189, 215, 24, 0, - - 149, 157, 137, 0, // row 42 - 113, 14, 218, 0, - 226, 65, 126, 0, - 114, 91, 78, 0, - 27, 0, 35, 0, - 288, 83, 17, 0, - 163, 10, 162, 0, - 222, 170, 71, 0, - - 151, 163, 173, 139, 0, // row 43 - 113, 132, 114, 168, 0, - 228, 69, 176, 102, 0, - 206, 22, 134, 161, 0, - 52, 243, 0, 270, 0, - 210, 3, 53, 167, 0, - 1, 163, 99, 98, 0, - 22, 127, 49, 125, 0, - - 139, 157, 163, 173, 0, // row 44 - 80, 78, 163, 274, 0, - 234, 227, 259, 260, 0, - 84, 4, 9, 12, 0, - 18, 0, 0, 57, 0, - 79, 244, 293, 272, 0, - 4, 6, 142, 3, 0, - 191, 211, 187, 148, 0, - - 149, 151, 167, 0, // row 45 - 135, 149, 15, 0, - 101, 228, 126, 0, - 184, 121, 29, 0, - 168, 0, 144, 0, - 82, 67, 235, 0, - 181, 45, 153, 0, - 177, 114, 93, 0 -}; - -// clang-format on - -// Base graph 2 is taken from 3GPP standard document 38.212 table 5.3.2-3. 
-// The format is consistent with the base graph 1 described above - -// The row start indices for base graph 2 -const uint32_t bg2_row_start[] = { - 0, 8, 18, 26, 36, 40, 46, 52, 58, 62, 67, 72, 77, 81, 86, - 91, 95, 100, 105, 109, 113, 117, 121, 124, 128, 132, 135, 140, 143, 147, - 150, 155, 158, 162, 166, 170, 174, 178, 181, 185, 189, 193, 197}; - -// clang-format off -const uint32_t bg2_columns[] = { - 0, 1, 2, 3, 6, 9, 10, 11, // row 0: 8 - 0, 3, 4, 5, 6, 7, 8, 9, 11, 12, // row 1: 10 - 0, 1, 3, 4, 8, 10, 12, 13, // row 2: 8 - 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, // row 3: 10 - 0, 1, 11, 14, // row 4: 4 - 0, 1, 5, 7, 11, 15, // row 5: 6 - 0, 5, 7, 9, 11, 16, // row 6: 6 - 1, 5, 7, 11, 13, 17, // row 7: 6 - 0, 1, 12, 18, // row 8: 4 - 1, 8, 10, 11, 19, // row 9: 5 - 0, 1, 6, 7, 20, // row 10: 5 - 0, 7, 9, 13, 21, // row 11: 5 - 1, 3, 11, 22, // row 12: 4 - 0, 1, 8, 13, 23, // row 13: 5 - 1, 6, 11, 13, 24, // row 14: 5 - 0, 10, 11, 25, // row 15: 4 - 1, 9, 11, 12, 26, // row 16: 5 - 1, 5, 11, 12, 27, // row 17: 5 - 0, 6, 7, 28, // row 18: 4 - 0, 1, 10, 29, // row 19: 4 - 1, 4, 11, 30, // row 20: 4 - 0, 8, 13, 31, // row 21: 4 - 1, 2, 32, // row 22: 3 - 0, 3, 5, 33, // row 23: 4 - 1, 2, 9, 34, // row 24: 4 - 0, 5, 35, // row 25: 3 - 2, 7, 12, 13, 36, // row 26: 5 - 0, 6, 37, // row 27: 3 - 1, 2, 5, 38, // row 28: 4 - 0, 4, 39, // row 29: 3 - 2, 5, 7, 9, 40, // row 30: 5 - 1, 13, 41, // row 31: 3 - 0, 5, 12, 42, // row 32: 4 - 2, 7, 10, 43, // row 33: 4 - 0, 12, 13, 44, // row 34: 4 - 1, 5, 11, 45, // row 35: 4 - 0, 2, 7, 46, // row 36: 4 - 10, 13, 47, // row 37: 3 - 1, 5, 11, 48, // row 38: 4 - 0, 7, 12, 49, // row 39: 4 - 2, 10, 13, 50, // row 40: 4 - 1, 5, 11, 51 // row 41: 4 -}; - -const uint32_t bg2_shifts[] = { - 9, 117, 204, 26, 189, 205, 0, 0, // row 0 - 174, 97, 166, 66, 71, 172, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 72, 110, 23, 181, 95, 8, 1, 0, - 3, 26, 53, 35, 115, 127, 0, 0, - 156, 143, 14, 3, 40, 123, 0, 0, - 143, 19, 176, 165, 196, 13, 0, 0, - 145, 131, 71, 21, 23, 112, 1, 0, - - 167, 166, 253, 125, 226, 156, 224, 252, 0, 0, // row 1 - 27, 36, 48, 92, 31, 187, 185, 3, 0, 0, - 137, 124, 0, 0, 88, 0, 0, 55, 0, 0, - 53, 156, 115, 156, 115, 200, 29, 31, 0, 0, - 19, 94, 104, 66, 84, 98, 69, 50, 0, 0, - 17, 65, 63, 1, 55, 37, 171, 133, 0, 0, - 18, 27, 3, 102, 185, 17, 14, 180, 0, 0, - 142, 174, 183, 27, 96, 23, 9, 167, 0, 0, - - 81, 114, 44, 52, 240, 1, 0, 0, // row 2 - 25, 114, 117, 110, 114, 1, 0, 0, - 20, 94, 99, 9, 108, 1, 0, 0, - 152, 131, 46, 191, 91, 0, 0, 0, - 95, 106, 92, 110, 111, 1, 0, 0, - 98, 168, 107, 82, 142, 1, 0, 0, - 126, 163, 47, 183, 132, 1, 0, 0, - 74, 31, 3, 53, 155, 0, 0, 0, - - 8, 58, 158, 104, 209, 54, 18, 128, 0, 0, // row 3 - 136, 175, 113, 72, 123, 118, 28, 186, 0, 0, - 38, 15, 102, 146, 12, 57, 53, 46, 0, 0, - 185, 6, 36, 124, 124, 110, 156, 133, 1, 0, - 120, 121, 22, 4, 73, 49, 128, 79, 0, 0, - 53, 174, 174, 127, 17, 89, 17, 105, 0, 0, - 36, 48, 18, 111, 203, 3, 191, 160, 0, 0, - 239, 171, 95, 110, 159, 199, 43, 75, 1, 0, - - 179, 214, 71, 0, // row 4 - 72, 74, 29, 0, - 0, 136, 157, 0, - 200, 16, 101, 0, - 42, 24, 51, 0, - 86, 67, 83, 0, - 43, 27, 117, 0, - 29, 140, 180, 0, - - 231, 41, 194, 159, 103, 0, // row 5 - 10, 44, 121, 80, 48, 0, - 0, 131, 142, 141, 64, 0, - 185, 138, 170, 219, 193, 0, - 40, 140, 84, 137, 71, 0, - 79, 84, 35, 103, 60, 0, - 136, 49, 36, 132, 62, 0, - 121, 41, 169, 88, 207, 0, - - 155, 228, 45, 28, 158, 0, // row 6 - 129, 92, 100, 49, 184, 0, - 0, 124, 99, 45, 148, 0, - 123, 55, 31, 222, 209, 0, - 109, 87, 107, 133, 139, 0, - 47, 154, 
10, 155, 29, 0, - 7, 34, 198, 168, 12, 0, - 137, 72, 172, 124, 56, 0, - - 129, 147, 140, 3, 116, 0, // row 7 - 80, 186, 16, 102, 143, 0, - 0, 45, 148, 96, 78, 0, - 103, 13, 105, 150, 181, 0, - 97, 135, 35, 108, 65, 0, - 48, 125, 24, 47, 55, 0, - 163, 78, 143, 107, 58, 0, - 86, 186, 87, 172, 154, 0, - - 142, 94, 230, 0, // row 8 - 118, 70, 152, 0, - 0, 65, 87, 0, - 147, 43, 152, 0, - 70, 69, 88, 0, - 53, 31, 161, 0, - 101, 177, 22, 0, - 176, 169, 225, 0, - - 203, 205, 61, 247, 0, // row 9 - 28, 132, 185, 178, 0, - 0, 97, 51, 85, 0, - 2, 30, 184, 83, 0, - 97, 40, 24, 49, 0, - 104, 142, 99, 64, 0, - 186, 27, 205, 81, 0, - 167, 238, 48, 68, 0, - - 11, 185, 0, 117, 0, // row 10 - 59, 104, 22, 52, 0, - 0, 17, 156, 20, 0, - 174, 150, 8, 56, 0, - 46, 41, 101, 96, 0, - 111, 25, 174, 23, 0, - 125, 60, 177, 51, 0, - 38, 217, 208, 232, 0, - - 11, 236, 210, 56, 0, // row 11 - 32, 92, 174, 154, 0, - 0, 7, 4, 2, 0, - 99, 138, 110, 99, 0, - 28, 30, 116, 64, 0, - 91, 175, 24, 141, 0, - 39, 29, 35, 8, 0, - 178, 214, 168, 51, 0, - - 63, 111, 14, 0, // row 12 - 39, 93, 11, 0, - 0, 113, 48, 0, - 46, 217, 109, 0, - 33, 122, 131, 0, - 122, 11, 4, 0, - 18, 155, 49, 0, - 124, 122, 72, 0, - - 83, 2, 38, 222, 0, // row 13 - 49, 125, 35, 166, 0, - 0, 112, 102, 26, 0, - 37, 113, 143, 140, 0, - 76, 37, 62, 47, 0, - 29, 91, 27, 127, 0, - 32, 53, 95, 186, 0, - 48, 57, 167, 219, 0, - - 115, 145, 3, 232, 0, // row 14 - 19, 118, 21, 163, 0, - 0, 138, 57, 27, 0, - 36, 95, 40, 116, 0, - 143, 51, 130, 97, 0, - 11, 145, 8, 166, 0, - 91, 20, 52, 109, 0, - 82, 232, 204, 162, 0, - - 51, 175, 213, 0, // row 15 - 68, 63, 81, 0, - 0, 73, 99, 0, - 116, 200, 110, 0, - 139, 96, 128, 0, - 137, 103, 40, 0, - 174, 108, 102, 0, - 38, 217, 157, 0, - - 203, 142, 8, 242, 0, // row 16 - 87, 177, 135, 64, 0, - 0, 79, 111, 143, 0, - 75, 158, 134, 97, 0, - 48, 9, 28, 8, 0, - 78, 158, 17, 165, 0, - 125, 31, 54, 176, 0, - 170, 23, 175, 202, 0, - - 254, 124, 114, 64, 0, // row 17 - 158, 23, 9, 6, 0, - 0, 24, 109, 18, 0, - 48, 132, 206, 2, 0, - 120, 43, 65, 42, 0, - 134, 23, 62, 163, 0, - 57, 201, 142, 35, 0, - 196, 173, 195, 218, 0, - - 220, 194, 50, 0, // row 18 - 186, 6, 46, 0, - 0, 18, 86, 0, - 68, 16, 156, 0, - 17, 106, 142, 0, - 173, 31, 22, 0, - 129, 203, 140, 0, - 128, 211, 210, 0, - - 87, 20, 185, 0, // row 19 - 58, 42, 156, 0, - 0, 158, 154, 0, - 35, 138, 86, 0, - 79, 28, 41, 0, - 13, 135, 145, 0, - 110, 124, 52, 0, - 39, 84, 88, 0, - - 26, 105, 29, 0, // row 20 - 76, 61, 153, 0, - 0, 148, 104, 0, - 6, 20, 141, 0, - 2, 103, 78, 0, - 128, 52, 173, 0, - 196, 35, 114, 0, - 117, 227, 6, 0, - - 76, 42, 210, 0, // row 21 - 157, 175, 67, 0, - 0, 17, 33, 0, - 80, 43, 81, 0, - 91, 75, 81, 0, - 156, 166, 40, 0, - 10, 122, 23, 0, - 238, 13, 11, 0, - - 222, 63, 0, // row 22 - 20, 52, 0, - 0, 4, 0, - 49, 1, 0, - 54, 132, 0, - 18, 163, 0, - 202, 126, 0, - 195, 44, 0, - - 23, 235, 238, 0, // row 23 - 106, 86, 95, 0, - 0, 75, 158, 0, - 156, 54, 134, 0, - 68, 115, 56, 0, - 110, 132, 150, 0, - 52, 170, 13, 0, - 5, 94, 111, 0, - - 46, 139, 8, 0, // row 24 - 182, 153, 64, 0, - 0, 69, 87, 0, - 153, 88, 63, 0, - 30, 42, 101, 0, - 113, 108, 61, 0, - 113, 161, 88, 0, - 81, 19, 130, 0, - - 228, 156, 0, // row 25 - 45, 21, 0, - 0, 65, 0, - 211, 94, 0, - 128, 63, 0, - 72, 136, 0, - 197, 194, 0, - 66, 95, 0, - - 29, 143, 160, 122, 0, // row 26 - 67, 137, 55, 85, 0, - 0, 100, 13, 7, 0, - 90, 6, 221, 6, 0, - 142, 28, 100, 133, 0, - 36, 38, 53, 145, 0, - 164, 172, 49, 161, 0, - 146, 66, 190, 86, 0, - - 8, 151, 0, // row 27 - 103, 50, 0, - 0, 32, 0, - 27, 118, 0, - 13, 10, 
0, - 42, 104, 0, - 168, 193, 0, - 64, 181, 0, - - 98, 101, 135, 0, // row 28 - 70, 111, 168, 0, - 0, 126, 110, 0, - 216, 212, 193, 0, - 106, 77, 43, 0, - 64, 24, 149, 0, - 14, 186, 46, 0, - 7, 144, 16, 0, - - 18, 28, 0, // row 29 - 110, 17, 0, - 0, 154, 0, - 108, 61, 0, - 133, 25, 0, - 139, 161, 0, - 50, 27, 0, - 25, 57, 0, - - 71, 240, 9, 84, 0, // row 30 - 120, 154, 52, 56, 0, - 0, 35, 51, 134, 0, - 106, 44, 185, 176, 0, - 87, 56, 104, 70, 0, - 84, 173, 93, 29, 0, - 70, 17, 50, 6, 0, - 37, 139, 221, 17, 0, - - 106, 1, 0, // row 31 - 3, 170, 0, - 0, 20, 0, - 147, 182, 0, - 80, 139, 0, - 117, 148, 0, - 115, 189, 0, - 201, 46, 0, - - 242, 44, 166, 0, // row 32 - 84, 8, 17, 0, - 0, 20, 122, 0, - 108, 21, 110, 0, - 32, 89, 71, 0, - 116, 73, 142, 0, - 110, 0, 163, 0, - 179, 14, 116, 0, - - 132, 164, 235, 0, // row 33 - 165, 179, 124, 0, - 0, 88, 13, 0, - 71, 12, 109, 0, - 135, 6, 2, 0, - 105, 137, 29, 0, - 163, 173, 179, 0, - 46, 2, 106, 0, - - 147, 85, 36, 0, // row 34 - 173, 177, 12, 0, - 0, 19, 78, 0, - 29, 201, 69, 0, - 37, 25, 114, 0, - 11, 41, 162, 0, - 197, 191, 193, 0, - 184, 135, 141, 0, - - 57, 40, 63, 0, // row 35 - 77, 184, 18, 0, - 0, 157, 6, 0, - 91, 165, 55, 0, - 60, 137, 93, 0, - 126, 152, 172, 0, - 157, 167, 181, 0, - 85, 225, 175, 0, - - 140, 38, 154, 0, // row 36 - 25, 151, 170, 0, - 0, 63, 82, 0, - 1, 175, 83, 0, - 121, 129, 26, 0, - 73, 154, 129, 0, - 197, 167, 179, 0, - 178, 112, 106, 0, - - 219, 151, 0, // row 37 - 37, 31, 0, - 0, 144, 0, - 40, 12, 0, - 97, 56, 0, - 167, 38, 0, - 181, 193, 0, - 154, 114, 0, - - 31, 66, 38, 0, // row 38 - 84, 151, 190, 0, - 0, 93, 19, 0, - 37, 97, 46, 0, - 1, 70, 1, 0, - 112, 7, 19, 0, - 157, 173, 191, 0, - 42, 41, 105, 0, - - 239, 172, 34, 0, // row 39 - 93, 132, 57, 0, - 0, 24, 138, 0, - 106, 181, 154, 0, - 119, 32, 142, 0, - 109, 6, 105, 0, - 181, 157, 173, 0, - 167, 45, 189, 0, - - 0, 75, 120, 0, // row 40 - 103, 107, 163, 0, - 0, 36, 143, 0, - 98, 35, 36, 0, - 6, 73, 102, 0, - 160, 156, 82, 0, - 193, 163, 179, 0, - 78, 67, 180, 0, - - 129, 229, 118, 0, // row 41 - 147, 7, 60, 0, - 0, 2, 55, 0, - 120, 101, 81, 0, - 48, 47, 19, 0, - 132, 6, 8, 0, - 191, 197, 167, 0, - 53, 215, 230, 0 -}; - // clang-format on inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, diff --git a/src/UpperPHY/LDPC/ldpc_tables.hpp b/src/UpperPHY/LDPC/ldpc_tables.hpp new file mode 100644 index 0000000..74585d4 --- /dev/null +++ b/src/UpperPHY/LDPC/ldpc_tables.hpp @@ -0,0 +1,934 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include "armral.h" +// The base graphs are given in compressed sparse row format. We need three +// arrays for this. Firstly we have the row start indices, which stores the +// indices into another array which indicates where the row starts. The next +// array stores the indices of the columns which are non-zero in a row. Finally, +// we have an array of values corresponding to the non-zero entries in the +// matrix. +// For example, `bg1_row_start[3]` is the index into `bg1_columns` for the +// start of the fourth row, and `bg1_columns[bg1_row_starts[3]]` is the index of +// a column in the fourth row of the matrix which contains a non-zero value. + +// Base graph 1 is taken from 3GPP standard document 38.212 table 5.3.2-2. 
+ +// The row start indices for base graph 1 +const uint32_t bg1_row_start[] = { + 0, 19, 38, 57, 76, 79, 87, 96, 103, 113, 122, 129, + 137, 144, 150, 157, 164, 170, 176, 182, 188, 194, 200, 205, + 210, 216, 221, 226, 230, 235, 240, 245, 250, 255, 260, 265, + 270, 275, 279, 284, 289, 293, 298, 302, 307, 312, 316}; + +// clang-format off +const uint32_t bg1_columns[] = { + 0, 1, 2, 3, 5, 6, 9, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 23, // row 0: 19 + 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, 24, // row 1: 19 + 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 18, 19, 20, 24, 25, // row 2: 19 + 0, 1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 25, // row 3: 19 + 0, 1, 26, // row 4: 3 + 0, 1, 3, 12, 16, 21, 22, 27, // row 5: 8 + 0, 6, 10, 11, 13, 17, 18, 20, 28, // row 6: 9 + 0, 1, 4, 7, 8, 14, 29, // row 7: 7 + 0, 1, 3, 12, 16, 19, 21, 22, 24, 30, // row 8: 10 + 0, 1, 10, 11, 13, 17, 18, 20, 31, // row 9: 9 + 1, 2, 4, 7, 8, 14, 32, // row 10: 7 + 0, 1, 12, 16, 21, 22, 23, 33, // row 11: 8 + 0, 1, 10, 11, 13, 18, 34, // row 12: 7 + 0, 3, 7, 20, 23, 35, // row 13: 6 + 0, 12, 15, 16, 17, 21, 36, // row 14: 7 + 0, 1, 10, 13, 18, 25, 37, // row 15: 7 + 1, 3, 11, 20, 22, 38, // row 16: 6 + 0, 14, 16, 17, 21, 39, // row 17: 6 + 1, 12, 13, 18, 19, 40, // row 18: 6 + 0, 1, 7, 8, 10, 41, // row 19: 6 + 0, 3, 9, 11, 22, 42, // row 20: 6 + 1, 5, 16, 20, 21, 43, // row 21: 6 + 0, 12, 13, 17, 44, // row 22: 5 + 1, 2, 10, 18, 45, // row 23: 5 + 0, 3, 4, 11, 22, 46, // row 24: 6 + 1, 6, 7, 14, 47, // row 25: 5 + 0, 2, 4, 15, 48, // row 26: 5 + 1, 6, 8, 49, // row 27: 4 + 0, 4, 19, 21, 50, // row 28: 5 + 1, 14, 18, 25, 51, // row 29: 5 + 0, 10, 13, 24, 52, // row 30: 5 + 1, 7, 22, 25, 53, // row 31: 5 + 0, 12, 14, 24, 54, // row 32: 5 + 1, 2, 11, 21, 55, // row 33: 5 + 0, 7, 15, 17, 56, // row 34: 5 + 1, 6, 12, 22, 57, // row 35: 5 + 0, 14, 15, 18, 58, // row 36: 5 + 1, 13, 23, 59, // row 37: 4 + 0, 9, 10, 12, 60, // row 38: 5 + 1, 3, 7, 19, 61, // row 39: 5 + 0, 8, 17, 62, // row 40: 4 + 1, 3, 9, 18, 63, // row 41: 5 + 0, 4, 24, 64, // row 42: 4 + 1, 16, 18, 25, 65, // row 43: 5 + 0, 7, 9, 22, 66, // row 44: 5 + 1, 6, 10, 67 // row 45: 4 +}; + +// The shifts are organized by row, and then by index set. Each line in the +// following represents the shifts in one index set for one block row of the +// matrix. Indexing into the array works as follows. 
If we are using index set k +// for k in [0, 7], and are on block row i, then the indexing function from k, i +// to j is ind(k, i) = 8 * bg1_row_start[i] + (bg1_row_start[i+1] - +// bg1_row_start[i]) * k +const uint32_t bg1_shifts[] = { + 250, 69, 226, 159, 100, 10, 59, 229, 110, 191, 9, 195, 23, 190, 35, 239, 31, 1, 0, // row 0 + 307, 19, 50, 369, 181, 216, 317, 288, 109, 17, 357, 215, 106, 242, 180, 330, 346, 1, 0, + 73, 15, 103, 49, 240, 39, 15, 162, 215, 164, 133, 298, 110, 113, 16, 189, 32, 1, 0, + 223, 16, 94, 91, 74, 10, 0, 205, 216, 21, 215, 14, 70, 141, 198, 104, 81, 1, 0, + 211, 198, 188, 186, 219, 4, 29, 144, 116, 216, 115, 233, 144, 95, 216, 73, 261, 1, 0, + 294, 118, 167, 330, 207, 165, 243, 250, 1, 339, 201, 53, 347, 304, 167, 47, 188, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 135, 227, 126, 134, 84, 83, 53, 225, 205, 128, 75, 135, 217, 220, 90, 105, 137, 1, 0, + + 2, 239, 117, 124, 71, 222, 104, 173, 220, 102, 109, 132, 142, 155, 255, 28, 0, 0, 0, // row 1 + 76, 76, 73, 288, 144, 331, 331, 178, 295, 342, 217, 99, 354, 114, 331, 112, 0, 0, 0, + 303, 294, 27, 261, 161, 133, 4, 80, 129, 300, 76, 266, 72, 83, 260, 301, 0, 0, 0, + 141, 45, 151, 46, 119, 157, 133, 87, 206, 93, 79, 9, 118, 194, 31, 187, 0, 0, 0, + 179, 162, 223, 256, 160, 76, 202, 117, 109, 15, 72, 152, 158, 147, 156, 119, 0, 0, 0, + 77, 225, 96, 338, 268, 112, 302, 50, 167, 253, 334, 242, 257, 133, 9, 302, 0, 0, 0, + 22, 11, 124, 0, 10, 0, 0, 2, 16, 60, 0, 6, 30, 0, 168, 31, 105, 0, 0, + 96, 236, 136, 221, 128, 92, 172, 56, 11, 189, 95, 85, 153, 87, 163, 216, 0, 0, 0, + + 106, 111, 185, 63, 117, 93, 229, 177, 95, 39, 142, 225, 225, 245, 205, 251, 117, 0, 0, // row 2 + 205, 250, 328, 332, 256, 161, 267, 160, 63, 129, 200, 88, 53, 131, 240, 205, 13, 0, 0, + 68, 7, 80, 280, 38, 227, 202, 200, 71, 106, 295, 283, 301, 184, 246, 230, 276, 0, 0, + 207, 203, 31, 176, 180, 186, 95, 153, 177, 70, 77, 214, 77, 198, 117, 223, 90, 0, 0, + 258, 167, 220, 133, 243, 202, 218, 63, 0, 3, 74, 229, 0, 216, 269, 200, 234, 0, 0, + 226, 35, 213, 302, 111, 265, 128, 237, 294, 127, 110, 286, 125, 131, 163, 210, 7, 0, 0, + 132, 37, 21, 180, 4, 149, 48, 38, 122, 195, 155, 28, 85, 47, 179, 42, 66, 0, 0, + 189, 4, 225, 151, 236, 117, 179, 92, 24, 68, 6, 101, 33, 96, 125, 67, 230, 0, 0, + + 121, 89, 84, 20, 150, 131, 243, 136, 86, 246, 219, 211, 240, 76, 244, 144, 12, 1, 0, // row 3 + 276, 87, 0, 275, 199, 153, 56, 132, 305, 231, 341, 212, 304, 300, 271, 39, 357, 1, 0, + 220, 208, 30, 197, 61, 175, 79, 281, 303, 253, 164, 53, 44, 28, 77, 319, 68, 1, 0, + 201, 18, 165, 5, 45, 142, 16, 34, 155, 213, 147, 69, 96, 74, 99, 30, 158, 1, 0, + 187, 145, 166, 108, 82, 132, 197, 41, 162, 57, 36, 115, 242, 165, 0, 113, 108, 1, 0, + 97, 94, 49, 279, 139, 166, 91, 106, 246, 345, 269, 185, 249, 215, 143, 121, 121, 1, 0, + 4, 6, 33, 113, 49, 21, 6, 151, 83, 154, 87, 5, 92, 173, 120, 2, 142, 0, 0, + 128, 23, 162, 220, 43, 186, 96, 1, 216, 22, 24, 167, 200, 32, 235, 172, 219, 1, 0, + + 157, 102, 0, // row 4 + 332, 181, 0, + 233, 205, 0, + 170, 10, 0, + 246, 235, 0, + 42, 256, 0, + 24, 204, 0, + 64, 211, 0, + + 205, 236, 194, 231, 28, 123, 115, 0, // row 5 + 195, 14, 115, 166, 241, 51, 157, 0, + 83, 292, 50, 318, 201, 267, 279, 0, + 164, 59, 86, 80, 182, 130, 153, 0, + 261, 181, 72, 283, 254, 79, 144, 0, + 219, 130, 251, 322, 295, 258, 283, 0, + 185, 100, 24, 65, 207, 161, 72, 0, + 2, 171, 47, 143, 210, 180, 180, 0, + + 183, 22, 28, 67, 244, 11, 157, 211, 0, // row 6 + 278, 257, 1, 351, 92, 253, 18, 225, 0, + 289, 21, 293, 13, 232, 302, 
138, 235, 0, + 158, 119, 113, 21, 63, 51, 136, 116, 0, + 80, 144, 169, 90, 59, 177, 151, 108, 0, + 294, 73, 330, 99, 172, 150, 284, 305, 0, + 6, 27, 163, 50, 48, 24, 38, 91, 0, + 199, 22, 23, 100, 92, 207, 52, 13, 0, + + 220, 44, 159, 31, 167, 104, 0, // row 7 + 9, 62, 316, 333, 290, 114, 0, + 12, 88, 207, 50, 25, 76, 0, + 17, 76, 104, 100, 150, 158, 0, + 169, 189, 154, 184, 104, 164, 0, + 3, 103, 224, 297, 215, 39, 0, + 145, 88, 112, 153, 159, 76, 0, + 77, 146, 209, 32, 166, 18, 0, + + 112, 4, 7, 211, 102, 164, 109, 241, 90, 0, // row 8 + 307, 179, 165, 18, 39, 224, 368, 67, 170, 0, + 295, 133, 130, 231, 296, 110, 269, 245, 154, 0, + 33, 95, 4, 217, 204, 39, 58, 44, 201, 0, + 54, 0, 252, 41, 98, 46, 15, 230, 54, 0, + 348, 75, 22, 312, 224, 17, 59, 314, 244, 0, + 172, 2, 131, 141, 96, 99, 101, 35, 116, 0, + 181, 105, 141, 223, 177, 145, 199, 153, 38, 0, + + 103, 182, 109, 21, 142, 14, 61, 216, 0, // row 9 + 366, 232, 321, 133, 57, 303, 63, 82, 0, + 189, 244, 36, 286, 151, 267, 135, 209, 0, + 9, 37, 213, 105, 89, 185, 109, 218, 0, + 162, 159, 93, 134, 45, 132, 76, 209, 0, + 156, 88, 293, 111, 92, 152, 23, 337, 0, + 6, 10, 145, 53, 201, 4, 164, 173, 0, + 169, 12, 206, 221, 17, 212, 92, 205, 0, + + 98, 149, 167, 160, 49, 58, 0, // row 10 + 101, 339, 274, 111, 383, 354, 0, + 14, 80, 211, 75, 161, 311, 0, + 82, 165, 174, 19, 194, 103, 0, + 178, 1, 28, 267, 234, 201, 0, + 175, 253, 27, 231, 49, 267, 0, + 126, 77, 156, 16, 12, 70, 0, + 116, 151, 70, 230, 115, 84, 0, + + 77, 41, 83, 182, 78, 252, 22, 0, // row 11 + 48, 102, 8, 47, 188, 334, 115, 0, + 16, 147, 290, 289, 177, 43, 280, 0, + 52, 11, 2, 35, 32, 84, 201, 0, + 55, 23, 274, 181, 273, 39, 26, 0, + 25, 322, 200, 351, 166, 338, 192, 0, + 184, 194, 123, 16, 104, 109, 124, 0, + 45, 115, 134, 1, 152, 165, 107, 0, + + 160, 42, 21, 32, 234, 7, 0, // row 12 + 77, 186, 174, 232, 50, 74, 0, + 229, 235, 169, 48, 105, 52, 0, + 142, 175, 136, 3, 28, 182, 0, + 225, 162, 244, 151, 238, 243, 0, + 123, 217, 142, 110, 176, 76, 0, + 6, 20, 203, 153, 104, 207, 0, + 186, 215, 124, 180, 98, 80, 0, + + 177, 248, 151, 185, 62, 0, // row 13 + 313, 177, 266, 115, 370, 0, + 39, 302, 303, 160, 37, 0, + 81, 56, 72, 217, 78, 0, + 231, 0, 216, 47, 36, 0, + 311, 251, 265, 94, 81, 0, + 52, 147, 1, 16, 46, 0, + 220, 185, 154, 178, 150, 0, + + 206, 55, 206, 127, 16, 229, 0, // row 14 + 142, 248, 137, 89, 347, 12, 0, + 78, 299, 54, 61, 179, 258, 0, + 14, 175, 211, 191, 51, 43, 0, + 0, 186, 253, 16, 0, 79, 0, + 22, 322, 277, 156, 66, 78, 0, + 1, 202, 118, 130, 1, 2, 0, + 124, 144, 182, 95, 72, 76, 0, + + 40, 96, 65, 63, 75, 179, 0, // row 15 + 241, 2, 210, 318, 55, 269, 0, + 229, 290, 60, 130, 184, 51, 0, + 90, 120, 131, 209, 209, 81, 0, + 170, 0, 183, 108, 68, 64, 0, + 176, 348, 15, 81, 176, 113, 0, + 173, 6, 81, 182, 53, 46, 0, + 39, 138, 220, 173, 142, 49, 0, + + 64, 49, 49, 51, 154, 0, // row 16 + 13, 338, 57, 289, 57, 0, + 69, 140, 45, 115, 300, 0, + 154, 164, 43, 189, 101, 0, + 270, 13, 99, 54, 0, 0, + 190, 293, 332, 331, 114, 0, + 88, 198, 160, 122, 182, 0, + 78, 152, 84, 5, 205, 0, + + 7, 164, 59, 1, 144, 0, // row 17 + 260, 303, 81, 358, 375, 0, + 257, 147, 128, 51, 228, 0, + 56, 110, 200, 63, 4, 0, + 153, 137, 0, 0, 162, 0, + 110, 228, 247, 116, 190, 0, + 91, 184, 30, 3, 155, 0, + 183, 112, 106, 219, 129, 0, + + 42, 233, 8, 155, 147, 0, // row 18 + 130, 163, 280, 132, 4, 0, + 260, 294, 291, 141, 295, 0, + 199, 110, 200, 143, 186, 0, + 161, 151, 0, 241, 144, 0, + 47, 286, 246, 181, 73, 0, + 1, 41, 167, 68, 148, 0, + 183, 215, 180, 143, 14, 0, + + 60, 73, 72, 127, 
224, 0, // row 19 + 145, 213, 344, 242, 197, 0, + 64, 181, 101, 270, 41, 0, + 8, 6, 103, 198, 8, 0, + 0, 0, 118, 144, 0, 0, + 87, 110, 147, 258, 204, 0, + 12, 6, 166, 184, 191, 0, + 179, 108, 159, 138, 196, 0, + + 151, 186, 217, 47, 160, 0, // row 20 + 187, 206, 264, 341, 59, 0, + 301, 162, 40, 130, 10, 0, + 105, 210, 121, 214, 183, 0, + 265, 81, 90, 144, 228, 0, + 89, 65, 155, 244, 30, 0, + 6, 12, 15, 5, 30, 0, + 77, 187, 203, 167, 130, 0, + + 249, 121, 109, 131, 171, 0, // row 21 + 205, 102, 328, 213, 97, 0, + 79, 175, 132, 283, 103, 0, + 192, 131, 220, 50, 106, 0, + 64, 46, 266, 9, 18, 0, + 162, 264, 346, 143, 109, 0, + 6, 86, 96, 42, 199, 0, + 197, 122, 215, 65, 216, 0, + + 64, 142, 188, 158, 0, // row 22 + 30, 11, 233, 22, 0, + 177, 20, 55, 316, 0, + 53, 0, 3, 148, 0, + 72, 189, 72, 257, 0, + 280, 157, 236, 113, 0, + 44, 58, 130, 131, 0, + 25, 47, 126, 178, 0, + + 156, 147, 170, 152, 0, // row 23 + 24, 89, 61, 27, 0, + 249, 50, 133, 105, 0, + 88, 203, 168, 122, 0, + 180, 0, 0, 165, 0, + 18, 6, 181, 304, 0, + 45, 18, 132, 100, 0, + 185, 127, 117, 199, 0, + + 112, 86, 236, 116, 222, 0, // row 24 + 298, 158, 235, 339, 234, 0, + 289, 280, 110, 187, 281, 0, + 49, 157, 64, 193, 124, 0, + 236, 199, 0, 266, 0, 0, + 38, 170, 249, 288, 194, 0, + 9, 125, 191, 28, 6, 0, + 32, 178, 2, 156, 58, 0, + + 23, 136, 116, 182, 0, // row 25 + 72, 17, 383, 312, 0, + 172, 295, 96, 46, 0, + 1, 166, 65, 81, 0, + 205, 0, 0, 183, 0, + 279, 255, 111, 54, 0, + 4, 74, 16, 28, 0, + 27, 141, 11, 181, 0, + + 195, 243, 215, 61, 0, // row 26 + 71, 81, 76, 136, 0, + 270, 110, 318, 67, 0, + 107, 176, 212, 127, 0, + 0, 0, 0, 277, 0, + 325, 326, 226, 99, 0, + 21, 142, 192, 197, 0, + 163, 131, 169, 98, 0, + + 25, 104, 194, 0, // row 27 + 194, 194, 101, 0, + 210, 29, 304, 0, + 208, 141, 174, 0, + 45, 36, 72, 0, + 91, 326, 268, 0, + 98, 140, 22, 0, + 165, 232, 9, 0, + + 128, 165, 181, 63, 0, // row 28 + 222, 19, 244, 274, 0, + 11, 293, 50, 234, 0, + 146, 153, 217, 114, 0, + 275, 0, 155, 62, 0, + 102, 1, 40, 167, 0, + 4, 1, 40, 93, 0, + 32, 43, 200, 205, 0, + + 86, 236, 84, 6, 0, // row 29 + 252, 5, 147, 78, 0, + 27, 308, 117, 29, 0, + 150, 11, 53, 68, 0, + 0, 180, 0, 42, 0, + 273, 104, 243, 107, 0, + 92, 136, 106, 6, 0, + 232, 32, 118, 103, 0, + + 216, 73, 120, 9, 0, // row 30 + 159, 229, 260, 90, 0, + 91, 23, 105, 135, 0, + 34, 130, 210, 123, 0, + 0, 90, 252, 173, 0, + 171, 16, 95, 212, 0, + 2, 88, 112, 20, 0, + 170, 199, 26, 105, 0, + + 95, 177, 172, 61, 0, // row 31 + 100, 215, 258, 256, 0, + 222, 308, 66, 162, 0, + 175, 49, 177, 128, 0, + 144, 144, 166, 19, 0, + 101, 297, 279, 222, 0, + 4, 49, 125, 194, 0, + 73, 149, 175, 108, 0, + + 221, 112, 199, 121, 0, // row 32 + 102, 201, 175, 287, 0, + 210, 22, 271, 217, 0, + 192, 209, 58, 30, 0, + 0, 211, 36, 162, 0, + 351, 265, 338, 83, 0, + 6, 126, 63, 20, 0, + 103, 110, 151, 211, 0, + + 2, 187, 41, 211, 0, // row 33 + 323, 8, 361, 105, 0, + 170, 20, 140, 33, 0, + 114, 49, 161, 137, 0, + 0, 0, 76, 18, 0, + 56, 304, 141, 101, 0, + 10, 30, 6, 92, 0, + 199, 132, 172, 65, 0, + + 127, 167, 164, 159, 0, // row 34 + 230, 148, 202, 312, 0, + 187, 296, 5, 44, 0, + 82, 186, 68, 150, 0, + 197, 0, 108, 0, 0, + 60, 320, 112, 54, 0, + 4, 153, 197, 155, 0, + 161, 237, 142, 180, 0, + + 161, 197, 207, 103, 0, // row 35 + 320, 335, 2, 266, 0, + 207, 158, 55, 285, 0, + 192, 173, 26, 187, 0, + 199, 278, 0, 205, 0, + 100, 210, 195, 268, 0, + 4, 45, 168, 185, 0, + 231, 174, 145, 100, 0, + + 37, 105, 51, 120, 0, // row 36 + 210, 313, 297, 21, 0, + 259, 179, 178, 160, 0, + 222, 157, 0, 6, 0, + 216, 16, 
0, 0, 0, + 135, 15, 35, 188, 0, + 6, 200, 177, 43, 0, + 11, 207, 42, 100, 0, + + 198, 220, 122, 0, // row 37 + 269, 82, 115, 0, + 298, 15, 115, 0, + 81, 195, 138, 0, + 72, 144, 0, 0, + 319, 236, 85, 0, + 82, 2, 135, 0, + 59, 204, 161, 0, + + 167, 151, 157, 163, 0, // row 38 + 185, 177, 289, 214, 0, + 151, 179, 64, 181, 0, + 123, 90, 73, 10, 0, + 190, 0, 0, 0, 0, + 164, 196, 209, 246, 0, + 91, 64, 198, 100, 0, + 121, 90, 26, 140, 0, + + 173, 139, 149, 0, 0, // row 39 + 258, 93, 346, 297, 0, + 102, 77, 192, 208, 0, + 12, 77, 49, 114, 0, + 153, 0, 165, 117, 0, + 236, 264, 37, 272, 0, + 4, 28, 109, 188, 0, + 115, 188, 168, 52, 0, + + 157, 137, 149, 0, // row 40 + 175, 37, 312, 0, + 32, 80, 197, 0, + 67, 45, 96, 0, + 216, 144, 2, 0, + 304, 237, 135, 0, + 10, 84, 12, 0, + 4, 103, 30, 0, + + 167, 173, 139, 151, 0, // row 41 + 52, 314, 139, 288, 0, + 154, 47, 124, 207, 0, + 23, 215, 60, 167, 0, + 0, 0, 0, 183, 0, + 123, 77, 25, 272, 0, + 2, 75, 142, 128, 0, + 53, 189, 215, 24, 0, + + 149, 157, 137, 0, // row 42 + 113, 14, 218, 0, + 226, 65, 126, 0, + 114, 91, 78, 0, + 27, 0, 35, 0, + 288, 83, 17, 0, + 163, 10, 162, 0, + 222, 170, 71, 0, + + 151, 163, 173, 139, 0, // row 43 + 113, 132, 114, 168, 0, + 228, 69, 176, 102, 0, + 206, 22, 134, 161, 0, + 52, 243, 0, 270, 0, + 210, 3, 53, 167, 0, + 1, 163, 99, 98, 0, + 22, 127, 49, 125, 0, + + 139, 157, 163, 173, 0, // row 44 + 80, 78, 163, 274, 0, + 234, 227, 259, 260, 0, + 84, 4, 9, 12, 0, + 18, 0, 0, 57, 0, + 79, 244, 293, 272, 0, + 4, 6, 142, 3, 0, + 191, 211, 187, 148, 0, + + 149, 151, 167, 0, // row 45 + 135, 149, 15, 0, + 101, 228, 126, 0, + 184, 121, 29, 0, + 168, 0, 144, 0, + 82, 67, 235, 0, + 181, 45, 153, 0, + 177, 114, 93, 0 +}; + +// clang-format on + +// Base graph 2 is taken from 3GPP standard document 38.212 table 5.3.2-3. 
+// The format is consistent with the base graph 1 described above + +// The row start indices for base graph 2 +const uint32_t bg2_row_start[] = { + 0, 8, 18, 26, 36, 40, 46, 52, 58, 62, 67, 72, 77, 81, 86, + 91, 95, 100, 105, 109, 113, 117, 121, 124, 128, 132, 135, 140, 143, 147, + 150, 155, 158, 162, 166, 170, 174, 178, 181, 185, 189, 193, 197}; + +// clang-format off +const uint32_t bg2_columns[] = { + 0, 1, 2, 3, 6, 9, 10, 11, // row 0: 8 + 0, 3, 4, 5, 6, 7, 8, 9, 11, 12, // row 1: 10 + 0, 1, 3, 4, 8, 10, 12, 13, // row 2: 8 + 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, // row 3: 10 + 0, 1, 11, 14, // row 4: 4 + 0, 1, 5, 7, 11, 15, // row 5: 6 + 0, 5, 7, 9, 11, 16, // row 6: 6 + 1, 5, 7, 11, 13, 17, // row 7: 6 + 0, 1, 12, 18, // row 8: 4 + 1, 8, 10, 11, 19, // row 9: 5 + 0, 1, 6, 7, 20, // row 10: 5 + 0, 7, 9, 13, 21, // row 11: 5 + 1, 3, 11, 22, // row 12: 4 + 0, 1, 8, 13, 23, // row 13: 5 + 1, 6, 11, 13, 24, // row 14: 5 + 0, 10, 11, 25, // row 15: 4 + 1, 9, 11, 12, 26, // row 16: 5 + 1, 5, 11, 12, 27, // row 17: 5 + 0, 6, 7, 28, // row 18: 4 + 0, 1, 10, 29, // row 19: 4 + 1, 4, 11, 30, // row 20: 4 + 0, 8, 13, 31, // row 21: 4 + 1, 2, 32, // row 22: 3 + 0, 3, 5, 33, // row 23: 4 + 1, 2, 9, 34, // row 24: 4 + 0, 5, 35, // row 25: 3 + 2, 7, 12, 13, 36, // row 26: 5 + 0, 6, 37, // row 27: 3 + 1, 2, 5, 38, // row 28: 4 + 0, 4, 39, // row 29: 3 + 2, 5, 7, 9, 40, // row 30: 5 + 1, 13, 41, // row 31: 3 + 0, 5, 12, 42, // row 32: 4 + 2, 7, 10, 43, // row 33: 4 + 0, 12, 13, 44, // row 34: 4 + 1, 5, 11, 45, // row 35: 4 + 0, 2, 7, 46, // row 36: 4 + 10, 13, 47, // row 37: 3 + 1, 5, 11, 48, // row 38: 4 + 0, 7, 12, 49, // row 39: 4 + 2, 10, 13, 50, // row 40: 4 + 1, 5, 11, 51 // row 41: 4 +}; + +const uint32_t bg2_shifts[] = { + 9, 117, 204, 26, 189, 205, 0, 0, // row 0 + 174, 97, 166, 66, 71, 172, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 72, 110, 23, 181, 95, 8, 1, 0, + 3, 26, 53, 35, 115, 127, 0, 0, + 156, 143, 14, 3, 40, 123, 0, 0, + 143, 19, 176, 165, 196, 13, 0, 0, + 145, 131, 71, 21, 23, 112, 1, 0, + + 167, 166, 253, 125, 226, 156, 224, 252, 0, 0, // row 1 + 27, 36, 48, 92, 31, 187, 185, 3, 0, 0, + 137, 124, 0, 0, 88, 0, 0, 55, 0, 0, + 53, 156, 115, 156, 115, 200, 29, 31, 0, 0, + 19, 94, 104, 66, 84, 98, 69, 50, 0, 0, + 17, 65, 63, 1, 55, 37, 171, 133, 0, 0, + 18, 27, 3, 102, 185, 17, 14, 180, 0, 0, + 142, 174, 183, 27, 96, 23, 9, 167, 0, 0, + + 81, 114, 44, 52, 240, 1, 0, 0, // row 2 + 25, 114, 117, 110, 114, 1, 0, 0, + 20, 94, 99, 9, 108, 1, 0, 0, + 152, 131, 46, 191, 91, 0, 0, 0, + 95, 106, 92, 110, 111, 1, 0, 0, + 98, 168, 107, 82, 142, 1, 0, 0, + 126, 163, 47, 183, 132, 1, 0, 0, + 74, 31, 3, 53, 155, 0, 0, 0, + + 8, 58, 158, 104, 209, 54, 18, 128, 0, 0, // row 3 + 136, 175, 113, 72, 123, 118, 28, 186, 0, 0, + 38, 15, 102, 146, 12, 57, 53, 46, 0, 0, + 185, 6, 36, 124, 124, 110, 156, 133, 1, 0, + 120, 121, 22, 4, 73, 49, 128, 79, 0, 0, + 53, 174, 174, 127, 17, 89, 17, 105, 0, 0, + 36, 48, 18, 111, 203, 3, 191, 160, 0, 0, + 239, 171, 95, 110, 159, 199, 43, 75, 1, 0, + + 179, 214, 71, 0, // row 4 + 72, 74, 29, 0, + 0, 136, 157, 0, + 200, 16, 101, 0, + 42, 24, 51, 0, + 86, 67, 83, 0, + 43, 27, 117, 0, + 29, 140, 180, 0, + + 231, 41, 194, 159, 103, 0, // row 5 + 10, 44, 121, 80, 48, 0, + 0, 131, 142, 141, 64, 0, + 185, 138, 170, 219, 193, 0, + 40, 140, 84, 137, 71, 0, + 79, 84, 35, 103, 60, 0, + 136, 49, 36, 132, 62, 0, + 121, 41, 169, 88, 207, 0, + + 155, 228, 45, 28, 158, 0, // row 6 + 129, 92, 100, 49, 184, 0, + 0, 124, 99, 45, 148, 0, + 123, 55, 31, 222, 209, 0, + 109, 87, 107, 133, 139, 0, + 47, 154, 
10, 155, 29, 0, + 7, 34, 198, 168, 12, 0, + 137, 72, 172, 124, 56, 0, + + 129, 147, 140, 3, 116, 0, // row 7 + 80, 186, 16, 102, 143, 0, + 0, 45, 148, 96, 78, 0, + 103, 13, 105, 150, 181, 0, + 97, 135, 35, 108, 65, 0, + 48, 125, 24, 47, 55, 0, + 163, 78, 143, 107, 58, 0, + 86, 186, 87, 172, 154, 0, + + 142, 94, 230, 0, // row 8 + 118, 70, 152, 0, + 0, 65, 87, 0, + 147, 43, 152, 0, + 70, 69, 88, 0, + 53, 31, 161, 0, + 101, 177, 22, 0, + 176, 169, 225, 0, + + 203, 205, 61, 247, 0, // row 9 + 28, 132, 185, 178, 0, + 0, 97, 51, 85, 0, + 2, 30, 184, 83, 0, + 97, 40, 24, 49, 0, + 104, 142, 99, 64, 0, + 186, 27, 205, 81, 0, + 167, 238, 48, 68, 0, + + 11, 185, 0, 117, 0, // row 10 + 59, 104, 22, 52, 0, + 0, 17, 156, 20, 0, + 174, 150, 8, 56, 0, + 46, 41, 101, 96, 0, + 111, 25, 174, 23, 0, + 125, 60, 177, 51, 0, + 38, 217, 208, 232, 0, + + 11, 236, 210, 56, 0, // row 11 + 32, 92, 174, 154, 0, + 0, 7, 4, 2, 0, + 99, 138, 110, 99, 0, + 28, 30, 116, 64, 0, + 91, 175, 24, 141, 0, + 39, 29, 35, 8, 0, + 178, 214, 168, 51, 0, + + 63, 111, 14, 0, // row 12 + 39, 93, 11, 0, + 0, 113, 48, 0, + 46, 217, 109, 0, + 33, 122, 131, 0, + 122, 11, 4, 0, + 18, 155, 49, 0, + 124, 122, 72, 0, + + 83, 2, 38, 222, 0, // row 13 + 49, 125, 35, 166, 0, + 0, 112, 102, 26, 0, + 37, 113, 143, 140, 0, + 76, 37, 62, 47, 0, + 29, 91, 27, 127, 0, + 32, 53, 95, 186, 0, + 48, 57, 167, 219, 0, + + 115, 145, 3, 232, 0, // row 14 + 19, 118, 21, 163, 0, + 0, 138, 57, 27, 0, + 36, 95, 40, 116, 0, + 143, 51, 130, 97, 0, + 11, 145, 8, 166, 0, + 91, 20, 52, 109, 0, + 82, 232, 204, 162, 0, + + 51, 175, 213, 0, // row 15 + 68, 63, 81, 0, + 0, 73, 99, 0, + 116, 200, 110, 0, + 139, 96, 128, 0, + 137, 103, 40, 0, + 174, 108, 102, 0, + 38, 217, 157, 0, + + 203, 142, 8, 242, 0, // row 16 + 87, 177, 135, 64, 0, + 0, 79, 111, 143, 0, + 75, 158, 134, 97, 0, + 48, 9, 28, 8, 0, + 78, 158, 17, 165, 0, + 125, 31, 54, 176, 0, + 170, 23, 175, 202, 0, + + 254, 124, 114, 64, 0, // row 17 + 158, 23, 9, 6, 0, + 0, 24, 109, 18, 0, + 48, 132, 206, 2, 0, + 120, 43, 65, 42, 0, + 134, 23, 62, 163, 0, + 57, 201, 142, 35, 0, + 196, 173, 195, 218, 0, + + 220, 194, 50, 0, // row 18 + 186, 6, 46, 0, + 0, 18, 86, 0, + 68, 16, 156, 0, + 17, 106, 142, 0, + 173, 31, 22, 0, + 129, 203, 140, 0, + 128, 211, 210, 0, + + 87, 20, 185, 0, // row 19 + 58, 42, 156, 0, + 0, 158, 154, 0, + 35, 138, 86, 0, + 79, 28, 41, 0, + 13, 135, 145, 0, + 110, 124, 52, 0, + 39, 84, 88, 0, + + 26, 105, 29, 0, // row 20 + 76, 61, 153, 0, + 0, 148, 104, 0, + 6, 20, 141, 0, + 2, 103, 78, 0, + 128, 52, 173, 0, + 196, 35, 114, 0, + 117, 227, 6, 0, + + 76, 42, 210, 0, // row 21 + 157, 175, 67, 0, + 0, 17, 33, 0, + 80, 43, 81, 0, + 91, 75, 81, 0, + 156, 166, 40, 0, + 10, 122, 23, 0, + 238, 13, 11, 0, + + 222, 63, 0, // row 22 + 20, 52, 0, + 0, 4, 0, + 49, 1, 0, + 54, 132, 0, + 18, 163, 0, + 202, 126, 0, + 195, 44, 0, + + 23, 235, 238, 0, // row 23 + 106, 86, 95, 0, + 0, 75, 158, 0, + 156, 54, 134, 0, + 68, 115, 56, 0, + 110, 132, 150, 0, + 52, 170, 13, 0, + 5, 94, 111, 0, + + 46, 139, 8, 0, // row 24 + 182, 153, 64, 0, + 0, 69, 87, 0, + 153, 88, 63, 0, + 30, 42, 101, 0, + 113, 108, 61, 0, + 113, 161, 88, 0, + 81, 19, 130, 0, + + 228, 156, 0, // row 25 + 45, 21, 0, + 0, 65, 0, + 211, 94, 0, + 128, 63, 0, + 72, 136, 0, + 197, 194, 0, + 66, 95, 0, + + 29, 143, 160, 122, 0, // row 26 + 67, 137, 55, 85, 0, + 0, 100, 13, 7, 0, + 90, 6, 221, 6, 0, + 142, 28, 100, 133, 0, + 36, 38, 53, 145, 0, + 164, 172, 49, 161, 0, + 146, 66, 190, 86, 0, + + 8, 151, 0, // row 27 + 103, 50, 0, + 0, 32, 0, + 27, 118, 0, + 13, 10, 
0, + 42, 104, 0, + 168, 193, 0, + 64, 181, 0, + + 98, 101, 135, 0, // row 28 + 70, 111, 168, 0, + 0, 126, 110, 0, + 216, 212, 193, 0, + 106, 77, 43, 0, + 64, 24, 149, 0, + 14, 186, 46, 0, + 7, 144, 16, 0, + + 18, 28, 0, // row 29 + 110, 17, 0, + 0, 154, 0, + 108, 61, 0, + 133, 25, 0, + 139, 161, 0, + 50, 27, 0, + 25, 57, 0, + + 71, 240, 9, 84, 0, // row 30 + 120, 154, 52, 56, 0, + 0, 35, 51, 134, 0, + 106, 44, 185, 176, 0, + 87, 56, 104, 70, 0, + 84, 173, 93, 29, 0, + 70, 17, 50, 6, 0, + 37, 139, 221, 17, 0, + + 106, 1, 0, // row 31 + 3, 170, 0, + 0, 20, 0, + 147, 182, 0, + 80, 139, 0, + 117, 148, 0, + 115, 189, 0, + 201, 46, 0, + + 242, 44, 166, 0, // row 32 + 84, 8, 17, 0, + 0, 20, 122, 0, + 108, 21, 110, 0, + 32, 89, 71, 0, + 116, 73, 142, 0, + 110, 0, 163, 0, + 179, 14, 116, 0, + + 132, 164, 235, 0, // row 33 + 165, 179, 124, 0, + 0, 88, 13, 0, + 71, 12, 109, 0, + 135, 6, 2, 0, + 105, 137, 29, 0, + 163, 173, 179, 0, + 46, 2, 106, 0, + + 147, 85, 36, 0, // row 34 + 173, 177, 12, 0, + 0, 19, 78, 0, + 29, 201, 69, 0, + 37, 25, 114, 0, + 11, 41, 162, 0, + 197, 191, 193, 0, + 184, 135, 141, 0, + + 57, 40, 63, 0, // row 35 + 77, 184, 18, 0, + 0, 157, 6, 0, + 91, 165, 55, 0, + 60, 137, 93, 0, + 126, 152, 172, 0, + 157, 167, 181, 0, + 85, 225, 175, 0, + + 140, 38, 154, 0, // row 36 + 25, 151, 170, 0, + 0, 63, 82, 0, + 1, 175, 83, 0, + 121, 129, 26, 0, + 73, 154, 129, 0, + 197, 167, 179, 0, + 178, 112, 106, 0, + + 219, 151, 0, // row 37 + 37, 31, 0, + 0, 144, 0, + 40, 12, 0, + 97, 56, 0, + 167, 38, 0, + 181, 193, 0, + 154, 114, 0, + + 31, 66, 38, 0, // row 38 + 84, 151, 190, 0, + 0, 93, 19, 0, + 37, 97, 46, 0, + 1, 70, 1, 0, + 112, 7, 19, 0, + 157, 173, 191, 0, + 42, 41, 105, 0, + + 239, 172, 34, 0, // row 39 + 93, 132, 57, 0, + 0, 24, 138, 0, + 106, 181, 154, 0, + 119, 32, 142, 0, + 109, 6, 105, 0, + 181, 157, 173, 0, + 167, 45, 189, 0, + + 0, 75, 120, 0, // row 40 + 103, 107, 163, 0, + 0, 36, 143, 0, + 98, 35, 36, 0, + 6, 73, 102, 0, + 160, 156, 82, 0, + 193, 163, 179, 0, + 78, 67, 180, 0, + + 129, 229, 118, 0, // row 41 + 147, 7, 60, 0, + 0, 2, 55, 0, + 120, 101, 81, 0, + 48, 47, 19, 0, + 132, 6, 8, 0, + 191, 197, 167, 0, + 53, 215, 230, 0 +}; \ No newline at end of file diff --git a/test/UpperPHY/LDPC/Decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp index f28b039..ab6ea27 100644 --- a/test/UpperPHY/LDPC/Decoding/main.cpp +++ b/test/UpperPHY/LDPC/Decoding/main.cpp @@ -9,7 +9,6 @@ #include "utils/bits_to_bytes.hpp" #include -#include #include #include #include -- GitLab From 8e16e31cd02a9f71cf1a5d9e36b058adfd60b101 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Fri, 10 Jan 2025 14:55:16 +0000 Subject: [PATCH 12/20] Implemented requests made in https://gitlab.arm.com/networking/ral/-/merge_requests/27. Changes include: minimising changes to armral header and CMake files. 
--- CMakeLists.txt | 1090 +++++++++++--------------- armral_acle.cmake.in | 136 ---- armral_hwy.cmake.in | 540 +++++++++++++ src/UpperPHY/CRC/acle/crc_common.hpp | 271 ------- src/UpperPHY/CRC/crc_common.hpp | 279 ++++++- 5 files changed, 1255 insertions(+), 1061 deletions(-) delete mode 100644 armral_acle.cmake.in delete mode 100644 src/UpperPHY/CRC/acle/crc_common.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 90d7265..a01db46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,94 @@ set(ARMRAL_ARCH NEON CACHE STRING "The architecture to build for ('NEON', 'SVE', 'SVE2' or 'HWY')") -set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE" "SVE2" "HWY") +set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2") + +set(ARMRAL_LIB_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) # Per source file compiler flag overrides/additions if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) @@ -181,6 +268,59 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) set(JOB_POOL_CONSOLE JOB_POOL console) endif() +if(ARMRAL_ARCH STREQUAL "HWY") + include(armral_hwy.cmake.in) + return() +endif() + +if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # If the optimization flags are already set, don't try and guess what they + # should be. + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.5-a+sve2+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.2-a+sve+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "NEON") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8-a+crypto" + CACHE INTERNAL "") + else() + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # We explicitly set the optimization flags, so just copy those. We still need + # to set the appropriate SVE version definition + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +else() + set(ARMRAL_ARCH_COMPILE_OPTIONS "") + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +endif() + if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} @@ -210,23 +350,14 @@ else() set(ARMRAL_LINKER_FLAGS "") endif() -add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) -# The armral library is defined within the include files -# -if(ARMRAL_ARCH STREQUAL "HWY") - # The armral_utils library will have additional link libraries added within - # this include - include(armral_hwy.cmake.in) -else() - include(armral_acle.cmake.in) -endif() - +add_library(armral ${ARMRAL_LIB_SOURCES}) target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} @@ -351,635 +482,300 @@ if(BUILD_TESTING) DEPENDS bench_${BENCH_NAME}) endfunction() - # Temporary duplication while porting is in progress to maintain the order of - # bench_excel_summary output - if(ARMRAL_ARCH 
STREQUAL "HWY") - # cmake-format: off - # add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) - # add_armral_test(matrix_inv_single - # test/BasicMathFun/MatrixInv/Single/main.cpp) - # add_armral_test(arm_solve - # test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) - # add_armral_test( - # matrix_vector_mult_batch_16 - # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) - # add_armral_test( - # matrix_vector_mult_batch_32 - # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) - # add_armral_test(matrix_mult_16 - # test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) - # add_armral_test(matrix_mult_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) - # add_armral_test( - # matrix_mult_aah_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - # add_armral_test( - # matrix_mult_ahb_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - # add_armral_test( - # matrix_vector_mult_single_16 - # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) - # add_armral_test( - # matrix_vector_mult_single_32 - # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - # add_armral_test(matrix_pseudo_inv_direct - # test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - # add_armral_test(vec_dot_16 - # test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - # add_armral_test(vec_dot_16_2 - # test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - # add_armral_test(vec_dot_16_2_32_bit - # test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - # add_armral_test(vec_dot_16_32_bit - # test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - # add_armral_test(vec_dot_32 - # test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - # add_armral_test(vec_dot_32_2 - # test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - # add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) - # add_armral_test(vec_mul_16_2 - # test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - # add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) - # add_armral_test(vec_mul_32_2 - # test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - # add_armral_test(mu_law_compression - # test/DuRuInterface/MuLaw/Compression/main.cpp) - # add_armral_test(mu_law_decompression - # test/DuRuInterface/MuLaw/Decompression/main.cpp) - # add_armral_test(block_float_compression - # test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) - # add_armral_test(block_float_decompression - # test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) - # add_armral_test(block_scaling_compression - # test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) - # add_armral_test(block_scaling_decompression - # test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - # add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) - # add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) - # add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) - # add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) - # add_armral_test(arm_fir_filter_cs16_decimate_2 - # test/LowerPHY/FIR/FIR16Decimate2/main.cpp) - # add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) - # add_armral_test(arm_fir_filter_cf32_decimate_2 - # test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - # add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - # add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - add_armral_test(crc test/UpperPHY/CRC/main.cpp) - # 
add_armral_test(tail_biting_convolutional_decoding - # test/UpperPHY/ConvolutionalDecoder/main.cpp) - # add_armral_test(tail_biting_convolutional_encoding - # test/UpperPHY/ConvolutionalEncoder/main.cpp) - # add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - # add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - # add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) - # add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - # add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - # add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) - # add_armral_test(polar_crc_attachment - # test/UpperPHY/Polar/CrcAttachment/main.cpp) - # add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) - # add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) - # add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) - # add_armral_test(polar_rate_matching - # test/UpperPHY/Polar/RateMatching/main.cpp) - # add_armral_test(polar_rate_recovery - # test/UpperPHY/Polar/RateRecovery/main.cpp) - # add_armral_test(polar_subchannel_deinterleave - # test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - # add_armral_test(polar_subchannel_interleave - # test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - # add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - # add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) - # add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) - # add_armral_test(turbo_rate_matching - # test/UpperPHY/Turbo/RateMatching/main.cpp) - # add_armral_test(turbo_rate_recovery - # test/UpperPHY/Turbo/RateRecovery/main.cpp) - # add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) - - # add_armral_bench( - # matrix_inv_batch_general - # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_general_pa - # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_hermitian - # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_hermitian_pa - # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - # add_armral_bench(matrix_inv_single_general - # bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) - # add_armral_bench( - # matrix_inv_single_hermitian - # bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) - # add_armral_bench(arm_solve_1x2 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - # add_armral_bench(arm_solve_1x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - # add_armral_bench(arm_solve_2x2 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - # add_armral_bench(arm_solve_2x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - # add_armral_bench(arm_solve_4x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_32b - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_32b_pa - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_64b - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_64b_pa - # 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_f32 - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_f32_pa - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - # add_armral_bench( - # matrix_mult_i16_32b - # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) - # add_armral_bench( - # matrix_mult_i16_64b - # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) - # add_armral_bench( - # matrix_mult_f32_2x2_iq - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_2x2 - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_4x4_iq - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_4x4 - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - # add_armral_bench( - # matmul_f32_general - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) - # add_armral_bench( - # matrix_mult_aah_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - # add_armral_bench( - # matrix_mult_ahb_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - # add_armral_bench( - # matrix_vector_mult_i16_32b - # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - # add_armral_bench( - # matrix_vector_mult_i16_64b - # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - # add_armral_bench( - # matrix_vector_mult_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - # add_armral_bench(matrix_pseudo_inv_direct - # bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - # add_armral_bench(vec_dot_16 - # bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - # add_armral_bench(vec_dot_16_2 - # bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - # add_armral_bench(vec_dot_16_2_32_bit - # bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - # add_armral_bench(vec_dot_16_32_bit - # bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - # add_armral_bench(vec_dot_32 - # bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - # add_armral_bench(vec_dot_32_2 - # bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - # add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) - # add_armral_bench(vec_mul_16_2 - # bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - # add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) - # add_armral_bench(vec_mul_32_2 - # bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - # add_armral_bench(mu_law_compression_14bit - # bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) - # add_armral_bench(mu_law_compression_8bit - # bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) - # add_armral_bench(mu_law_compression_9bit - # bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) - # add_armral_bench(mu_law_decompression_14bit - # bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) - # add_armral_bench(mu_law_decompression_8bit - # bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) - # add_armral_bench(mu_law_decompression_9bit - # bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) - # add_armral_bench( - # block_float_compression_12bit - # bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) - # add_armral_bench( - # 
block_float_compression_14bit - # bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) - # add_armral_bench( - # block_float_compression_8bit - # bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) - # add_armral_bench( - # block_float_compression_9bit - # bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) - # add_armral_bench( - # block_float_decompression_12bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) - # add_armral_bench( - # block_float_decompression_14bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) - # add_armral_bench( - # block_float_decompression_8bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) - # add_armral_bench( - # block_float_decompression_9bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_14bit - # bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_8bit - # bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_9bit - # bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_14bit - # bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_8bit - # bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_9bit - # bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) - # add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) - # add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) - # add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) - # add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) - # add_armral_bench(arm_fir_filter_cs16_decimate_2 - # bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) - # add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) - # add_armral_bench(arm_fir_filter_cf32_decimate_2 - # bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - # add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - # add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - # add_armral_bench(tail_biting_convolutional_decoding - # bench/UpperPHY/ConvolutionalDecoder/main.cpp) - # add_armral_bench(tail_biting_convolutional_encoding - # bench/UpperPHY/ConvolutionalEncoder/main.cpp) - # add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - # add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - # 
add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - # add_armral_bench(ldpc_rate_matching - # bench/UpperPHY/LDPC/RateMatching/main.cpp) - # add_armral_bench(ldpc_rate_recovery - # bench/UpperPHY/LDPC/RateRecovery/main.cpp) - # add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) - # add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) - # add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) - # add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) - # add_armral_bench(polar_rate_matching - # bench/UpperPHY/Polar/RateMatching/main.cpp) - # add_armral_bench(polar_rate_recovery - # bench/UpperPHY/Polar/RateRecovery/main.cpp) - # add_armral_bench(polar_subchannel_deinterleave - # bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - # add_armral_bench(polar_subchannel_interleave - # bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - # add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) - # add_armral_bench(turbo_rate_matching - # bench/UpperPHY/Turbo/RateMatching/main.cpp) - # add_armral_bench(turbo_rate_recovery - # bench/UpperPHY/Turbo/RateRecovery/main.cpp) - # add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) - # cmake-format: on - else() - add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) - add_armral_test(matrix_inv_single - test/BasicMathFun/MatrixInv/Single/main.cpp) - add_armral_test(arm_solve - test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) - add_armral_test( - matrix_vector_mult_batch_16 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_batch_32 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_mult_16 - test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) - add_armral_test(matrix_mult_32 - test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) - add_armral_test( - matrix_mult_aah_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_test( - matrix_mult_ahb_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_test( - matrix_vector_mult_single_16 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_single_32 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_pseudo_inv_direct - test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_test(vec_dot_16 - test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_test(vec_dot_16_2 - test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_test(vec_dot_16_2_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_test(vec_dot_16_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_test(vec_dot_32 - test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_test(vec_dot_32_2 - test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_test(vec_mul_16_2 - test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) - add_armral_test(vec_mul_32_2 - test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_test(mu_law_compression - test/DuRuInterface/MuLaw/Compression/main.cpp) - add_armral_test(mu_law_decompression - 
test/DuRuInterface/MuLaw/Decompression/main.cpp) - add_armral_test(block_float_compression - test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) - add_armral_test(block_float_decompression - test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) - add_armral_test(block_scaling_compression - test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) - add_armral_test(block_scaling_decompression - test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) - add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) - add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) - add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) - add_armral_test(arm_fir_filter_cs16_decimate_2 - test/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) - add_armral_test(arm_fir_filter_cf32_decimate_2 - test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - add_armral_test(crc test/UpperPHY/CRC/main.cpp) - add_armral_test(tail_biting_convolutional_decoding - test/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_test(tail_biting_convolutional_encoding - test/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) - add_armral_test(polar_crc_attachment - test/UpperPHY/Polar/CrcAttachment/main.cpp) - add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) - add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) - add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) - add_armral_test(polar_rate_matching - test/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_test(polar_rate_recovery - test/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_test(polar_subchannel_deinterleave - test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - add_armral_test(polar_subchannel_interleave - test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) - add_armral_test(turbo_rate_matching - test/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_test(turbo_rate_recovery - test/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) - - add_armral_bench( - matrix_inv_batch_general - bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - add_armral_bench( - matrix_inv_batch_general_pa - bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian_pa - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - add_armral_bench(matrix_inv_single_general - bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) - add_armral_bench( - matrix_inv_single_hermitian - 
bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) - add_armral_bench(arm_solve_1x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - add_armral_bench(arm_solve_1x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - add_armral_bench(arm_solve_2x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - add_armral_bench(arm_solve_2x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - add_armral_bench(arm_solve_4x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32 - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - add_armral_bench( - matrix_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) - add_armral_bench( - matrix_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) - add_armral_bench( - matrix_mult_f32_2x2_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_2x2 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - add_armral_bench( - matmul_f32_general - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) - add_armral_bench( - matrix_mult_aah_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_bench( - matrix_mult_ahb_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - add_armral_bench( - matrix_vector_mult_32 - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_bench(matrix_pseudo_inv_direct - bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_bench(vec_dot_16 - bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_bench(vec_dot_16_2 - bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_bench(vec_dot_16_2_32_bit - bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_bench(vec_dot_16_32_bit - bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_bench(vec_dot_32 - bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_bench(vec_dot_32_2 - bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_bench(vec_mul_16_2 - bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) - 
add_armral_bench(vec_mul_32_2 - bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_bench(mu_law_compression_14bit - bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) - add_armral_bench(mu_law_compression_8bit - bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) - add_armral_bench(mu_law_compression_9bit - bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) - add_armral_bench(mu_law_decompression_14bit - bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) - add_armral_bench(mu_law_decompression_8bit - bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) - add_armral_bench(mu_law_decompression_9bit - bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) - add_armral_bench( - block_float_compression_12bit - bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) - add_armral_bench( - block_float_compression_14bit - bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) - add_armral_bench( - block_float_compression_8bit - bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) - add_armral_bench( - block_float_compression_9bit - bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) - add_armral_bench( - block_float_decompression_12bit - bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) - add_armral_bench( - block_float_decompression_14bit - bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) - add_armral_bench( - block_float_decompression_8bit - bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) - add_armral_bench( - block_float_decompression_9bit - bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) - add_armral_bench( - block_scaling_compression_14bit - bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) - add_armral_bench( - block_scaling_compression_8bit - bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) - add_armral_bench( - block_scaling_compression_9bit - bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) - add_armral_bench( - block_scaling_decompression_14bit - bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) - add_armral_bench( - block_scaling_decompression_8bit - bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) - add_armral_bench( - block_scaling_decompression_9bit - bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) - add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) - add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) - add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) - add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) - add_armral_bench(arm_fir_filter_cs16_decimate_2 - bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) - add_armral_bench(arm_fir_filter_cf32_decimate_2 - bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - 
add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding - bench/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding - bench/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching - bench/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery - bench/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) - add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) - add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) - add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) - add_armral_bench(polar_rate_matching - bench/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_bench(polar_rate_recovery - bench/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_bench(polar_subchannel_deinterleave - bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - add_armral_bench(polar_subchannel_interleave - bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_bench(turbo_rate_matching - bench/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_bench(turbo_rate_recovery - bench/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) - endif() + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) + add_armral_test(arm_solve + test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + add_armral_test( + matrix_vector_mult_batch_16 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_batch_32 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_mult_16 + test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + add_armral_test(matrix_mult_32 + test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + add_armral_test(matrix_mult_aah_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_test(matrix_mult_ahb_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_test( + matrix_vector_mult_single_16 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_single_32 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_pseudo_inv_direct + test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_test(vec_dot_16_2 + test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_test(vec_dot_16_2_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_test(vec_dot_16_32_bit + 
test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_test(vec_dot_32_2 + test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_test(mu_law_compression + test/DuRuInterface/MuLaw/Compression/main.cpp) + add_armral_test(mu_law_decompression + test/DuRuInterface/MuLaw/Decompression/main.cpp) + add_armral_test(block_float_compression + test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + add_armral_test(block_float_decompression + test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + add_armral_test(block_scaling_compression + test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + add_armral_test(block_scaling_decompression + test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + add_armral_test(arm_fir_filter_cs16_decimate_2 + test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + add_armral_test(arm_fir_filter_cf32_decimate_2 + test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(polar_crc_attachment + test/UpperPHY/Polar/CrcAttachment/main.cpp) + add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) + add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) + add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_test(polar_subchannel_deinterleave + test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_test(polar_subchannel_interleave + test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) + add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(svd 
test/MatrixFactorizations/SVD/main.cpp) + + add_armral_bench( + matrix_inv_batch_general + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + add_armral_bench(matrix_inv_batch_general_pa + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian_pa + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + add_armral_bench(matrix_inv_single_general + bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + add_armral_bench(matrix_inv_single_hermitian + bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + add_armral_bench(arm_solve_1x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + add_armral_bench(arm_solve_1x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + add_armral_bench(arm_solve_2x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + add_armral_bench(arm_solve_2x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + add_armral_bench(arm_solve_4x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32 + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + add_armral_bench( + matrix_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + add_armral_bench( + matrix_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + add_armral_bench( + matmul_f32_general + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + add_armral_bench( + matrix_mult_aah_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_bench( + matrix_mult_ahb_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + add_armral_bench( + matrix_vector_mult_32 + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_bench(matrix_pseudo_inv_direct + bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_bench(vec_dot_16 + bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_bench(vec_dot_16_2 + 
bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_bench(vec_dot_16_2_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_bench(vec_dot_16_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_bench(vec_dot_32 + bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_bench(vec_dot_32_2 + bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_bench(vec_mul_16_2 + bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_bench(vec_mul_32_2 + bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_bench(mu_law_compression_14bit + bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + add_armral_bench(mu_law_compression_8bit + bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + add_armral_bench(mu_law_compression_9bit + bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + add_armral_bench(mu_law_decompression_14bit + bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + add_armral_bench(mu_law_decompression_8bit + bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + add_armral_bench(mu_law_decompression_9bit + bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + add_armral_bench( + block_float_compression_12bit + bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + add_armral_bench( + block_float_compression_14bit + bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + add_armral_bench(block_float_compression_8bit + bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + add_armral_bench(block_float_compression_9bit + bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + add_armral_bench( + block_float_decompression_12bit + bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + add_armral_bench( + block_float_decompression_14bit + bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + add_armral_bench( + block_float_decompression_8bit + bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + add_armral_bench( + block_float_decompression_9bit + bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + add_armral_bench( + block_scaling_compression_14bit + bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + add_armral_bench( + block_scaling_compression_8bit + bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + add_armral_bench( + block_scaling_compression_9bit + bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + add_armral_bench( + block_scaling_decompression_14bit + bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + add_armral_bench( + block_scaling_decompression_8bit + bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + add_armral_bench( + block_scaling_decompression_9bit + bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + add_armral_bench(arm_fir_filter_cs16_decimate_2 + bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + add_armral_bench(arm_fir_filter_cf32_decimate_2 + 
bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + add_armral_bench(polar_rate_matching + bench/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_bench(polar_rate_recovery + bench/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_bench(polar_subchannel_deinterleave + bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_bench(polar_subchannel_interleave + bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) endif() if(BUILD_EXAMPLES) @@ -1009,15 +805,13 @@ if(BUILD_EXAMPLES) add_dependencies(run_examples run_${EXAMPLE_EXE}) endfunction() - if(NOT ARMRAL_ARCH STREQUAL "HWY") - add_armral_example(examples/block_float_9b_example.c) - add_armral_example(examples/fft_cf32_example.c 10) - add_armral_example(examples/modulation_example.c) - add_armral_example(examples/polar_example.cpp 128 100 35) - endif() + add_armral_example(examples/block_float_9b_example.c) + add_armral_example(examples/fft_cf32_example.c 10) + add_armral_example(examples/modulation_example.c) + add_armral_example(examples/polar_example.cpp 128 100 35) endif() -if(BUILD_SIMULATION AND NOT (ARMRAL_ARCH STREQUAL "HWY")) +if(BUILD_SIMULATION) # Include simulation rules and targets This involves building dependencies # like AWGN library and OpenMP add_subdirectory(simulation) diff --git 
a/armral_acle.cmake.in b/armral_acle.cmake.in deleted file mode 100644 index d1e9c0d..0000000 --- a/armral_acle.cmake.in +++ /dev/null @@ -1,136 +0,0 @@ -if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # If the optimization flags are already set, don't try and guess what they - # should be. - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.5-a+sve2+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.2-a+sve+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "NEON") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8-a+crypto" - CACHE INTERNAL "") - else() - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # We explicitly set the optimization flags, so just copy those. We still need - # to set the appropriate SVE version definition - set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -else() - set(ARMRAL_ARCH_COMPILE_OPTIONS "") - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -endif() - -set(ARMRAL_LIB_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c - 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) - -add_library(armral ${ARMRAL_LIB_SOURCES}) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index 502cdac..6808b65 100644 --- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -149,7 +149,547 @@ set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp ) +if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + set(ARMRAL_COMPILER_FLAGS + ${ARMRAL_COMPILER_FLAGS} + $<$:-Wshadow + -Wall + -Wcast-qual> + $<$:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17> + $<$:-Og + -g3 + -ggdb + -fno-omit-frame-pointer>) + # Disable GLIBCXX assertions to avoid introducing dependency on libstdc++ + add_definitions(-D_GLIBCXX_NO_ASSERTIONS) + message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}") +else() + # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the + # compile line + message(STATUS "Overriding compilation flags with manually set flags") + message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + set(ARMRAL_COMPILER_FLAGS "") + set(ARMRAL_LINKER_FLAGS "") +endif() + add_library(armral ${ARMRAL_LIB_SOURCES}) +target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) + +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) +target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) target_link_libraries(armral PUBLIC hwy) target_link_libraries(armral_utils PUBLIC hwy) + +if(ARMRAL_SEMIHOSTING) + # When semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable + # ARMRAL_SEMIHOSTING + target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") + target_compile_definitions(armral_utils PUBLIC 
"ARMRAL_SEMIHOSTING") +endif() + +include(GNUInstallDirs) +install(TARGETS armral DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install( + DIRECTORY include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") +install(FILES LICENSE.md THIRD_PARTY_LICENSES.md + DESTINATION ${CMAKE_INSTALL_DATADIR}/licenses/armral) + +if(BUILD_TESTING) + include(CTest) + + if(NOT DEFINED BENCHMARKER_SOURCE_DIR) + set(BENCHMARKER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_BUILD_DIR) + set(BENCHMARKER_BUILD_DIR ${CMAKE_BINARY_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_RUNNER) + set(BENCHMARKER_RUNNER "${BENCHMARKER_SOURCE_DIR}/bench/default_runner.py") + endif() + + add_custom_target( + check + COMMAND ${CMAKE_CTEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_custom_target( + bench + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_concurrent + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_excel_summary + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} | tee + ${BENCHMARKER_BUILD_DIR}/out.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py + ${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + set(ARMRAL_TEST_LINK_LIBRARIES armral armral_utils) + + if(STATIC_TESTING) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -static) + endif() + + # Utility function to add a test + function(add_armral_test TEST_NAME TEST_SOURCE) + # Build the actual test executable itself + add_executable(${TEST_NAME} ${TEST_SOURCE}) + target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(${TEST_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} + ${ARMRAL_ARCH_COMPILE_OPTIONS}) + + # Register it as a test, set up dependencies + add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} + ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + if(ARMRAL_ENABLE_ASAN) + # Avoid slow-downs in newer versions of Address Santizier + # https://github.com/llvm/llvm-project/issues/64190 + set_tests_properties( + ${TEST_NAME} PROPERTIES ENVIRONMENT + "ASAN_OPTIONS=detect_stack_use_after_return=0") + endif() + add_dependencies(check ${TEST_NAME}) + endfunction() + + # Utility function to add a benchmark + function(add_armral_bench BENCH_NAME BENCH_SOURCE) + + # Build the actual bench executable itself + add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) + target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) + + # Register it as a benchmark, set up dependencies + add_dependencies(bench bench_${BENCH_NAME}) + add_dependencies(bench_concurrent bench_${BENCH_NAME}) + add_dependencies(bench_excel_summary bench_${BENCH_NAME}) + + # Add target for running the benchmark + get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) + 
add_custom_target( + run_bench_${BENCH_NAME} + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py + ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} ${BENCHMARKER_BUILD_DIR} + --runner ${BENCHMARKER_RUNNER} --concurrent ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR} + DEPENDS bench_${BENCH_NAME}) + endfunction() + + # cmake-format: off +# add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) +# add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) +# add_armral_test(arm_solve +# test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_16 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_32 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_mult_16 +# test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) +# add_armral_test(matrix_mult_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) +# add_armral_test(matrix_mult_aah_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_test(matrix_mult_ahb_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_test( +# matrix_vector_mult_single_16 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) +# add_armral_test( +# matrix_vector_mult_single_32 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_pseudo_inv_direct +# test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_test(vec_dot_16_2 +# test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_test(vec_dot_16_2_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_test(vec_dot_16_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_test(vec_dot_32_2 +# test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_test(mu_law_compression +# test/DuRuInterface/MuLaw/Compression/main.cpp) +# add_armral_test(mu_law_decompression +# test/DuRuInterface/MuLaw/Decompression/main.cpp) +# add_armral_test(block_float_compression +# test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) +# add_armral_test(block_float_decompression +# test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) +# add_armral_test(block_scaling_compression +# test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) +# add_armral_test(block_scaling_decompression +# test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) +# add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) +# add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_test(arm_fir_filter_cs16_decimate_2 +# test/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_test(arm_fir_filter_cf32_decimate_2 +# 
test/LowerPHY/FIR/FIR32Decimate2/main.cpp) +# add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) +# add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) +# add_armral_test(tail_biting_convolutional_decoding +# test/UpperPHY/ConvolutionalDecoder/main.cpp) +# add_armral_test(tail_biting_convolutional_encoding +# test/UpperPHY/ConvolutionalEncoder/main.cpp) +# add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) +# add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) +# add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) +# add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) +# add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) +# add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) +# add_armral_test(polar_crc_attachment +# test/UpperPHY/Polar/CrcAttachment/main.cpp) +# add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_test(polar_subchannel_deinterleave +# test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_test(polar_subchannel_interleave +# test/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) +# add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) +# +# add_armral_bench( +# matrix_inv_batch_general +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) +# add_armral_bench(matrix_inv_batch_general_pa +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian_pa +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) +# add_armral_bench(matrix_inv_single_general +# bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) +# add_armral_bench(matrix_inv_single_hermitian +# bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) +# add_armral_bench(arm_solve_1x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) +# add_armral_bench(arm_solve_1x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) +# add_armral_bench(arm_solve_2x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) +# add_armral_bench(arm_solve_2x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) +# add_armral_bench(arm_solve_4x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b +# 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32 +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) +# add_armral_bench( +# matrix_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) +# add_armral_bench( +# matrix_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_4x4_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_4x4 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) +# add_armral_bench( +# matmul_f32_general +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) +# add_armral_bench( +# matrix_mult_aah_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_bench( +# matrix_mult_ahb_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_bench(matrix_pseudo_inv_direct +# bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_bench(vec_dot_16 +# bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_bench(vec_dot_16_2 +# bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_bench(vec_dot_16_2_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_bench(vec_dot_16_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_bench(vec_dot_32 +# bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_bench(vec_dot_32_2 +# bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_bench(vec_mul_16_2 +# bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_bench(vec_mul_32_2 +# bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_bench(mu_law_compression_14bit +# bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) +# add_armral_bench(mu_law_compression_8bit +# bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) +# add_armral_bench(mu_law_compression_9bit +# bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) +# add_armral_bench(mu_law_decompression_14bit +# bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) +# add_armral_bench(mu_law_decompression_8bit +# bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) +# add_armral_bench(mu_law_decompression_9bit +# bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_float_compression_12bit +# 
bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) +# add_armral_bench( +# block_float_compression_14bit +# bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) +# add_armral_bench(block_float_compression_8bit +# bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) +# add_armral_bench(block_float_compression_9bit +# bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) +# add_armral_bench( +# block_float_decompression_12bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) +# add_armral_bench( +# block_float_decompression_14bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_float_decompression_8bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) +# add_armral_bench( +# block_float_decompression_9bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_14bit +# bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_8bit +# bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_9bit +# bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_14bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_8bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_9bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) +# add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) +# add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_bench(arm_fir_filter_cs16_decimate_2 +# bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_bench(arm_fir_filter_cf32_decimate_2 +# bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) +# add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) +# add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) +# add_armral_bench(tail_biting_convolutional_decoding +# bench/UpperPHY/ConvolutionalDecoder/main.cpp) +# add_armral_bench(tail_biting_convolutional_encoding +# bench/UpperPHY/ConvolutionalEncoder/main.cpp) +# add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) +# add_armral_bench(ldpc_decoding 
bench/UpperPHY/LDPC/Decoding/main.cpp) +# add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) +# add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) +# add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) +# add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) +# add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_bench(polar_rate_matching +# bench/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_bench(polar_rate_recovery +# bench/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_bench(polar_subchannel_deinterleave +# bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_bench(polar_subchannel_interleave +# bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_bench(turbo_rate_matching +# bench/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_bench(turbo_rate_recovery +# bench/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) +# cmake-format: on +endif() + +if(BUILD_EXAMPLES) + add_custom_target(make_examples_dir ALL COMMAND ${CMAKE_COMMAND} -E + make_directory examples) + add_custom_target(examples) + add_custom_target(run_examples) + add_dependencies(run_examples examples) + + # Any parameters after the first one will be passed as parameters to the + # example executable when running it + function(add_armral_example EXAMPLE_SOURCE) + get_filename_component(EXAMPLE_EXE ${EXAMPLE_SOURCE} NAME_WE) + add_executable(${EXAMPLE_EXE} ${EXAMPLE_SOURCE}) + add_dependencies(${EXAMPLE_EXE} make_examples_dir) + set(EXAMPLE_OUTPUT_NAME examples/${EXAMPLE_EXE}) + set_target_properties(${EXAMPLE_EXE} PROPERTIES OUTPUT_NAME + ${EXAMPLE_OUTPUT_NAME}) + + target_link_libraries(${EXAMPLE_EXE} armral m) + + add_custom_target( + run_${EXAMPLE_EXE} + COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} + DEPENDS ${EXAMPLE_EXE}) + add_dependencies(examples ${EXAMPLE_EXE}) + add_dependencies(run_examples run_${EXAMPLE_EXE}) + endfunction() + + # cmake-format: off +# add_armral_example(examples/block_float_9b_example.c) +# add_armral_example(examples/fft_cf32_example.c 10) +# add_armral_example(examples/modulation_example.c) +# add_armral_example(examples/polar_example.cpp 128 100 35) +# cmake-format: on +endif() + +# if(BUILD_SIMULATION) # Include simulation rules and targets This involves +# building dependencies # like AWGN library and OpenMP +# add_subdirectory(simulation) endif() + +find_package(Doxygen) +if(DOXYGEN_FOUND) + set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) + set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) + configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) + add_custom_target(docs COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}) + install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs/html + DESTINATION ${CMAKE_INSTALL_DOCDIR} + OPTIONAL) +endif() + +# Create target to uninstall the library +if(NOT TARGET uninstall) + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) + + add_custom_target( + uninstall COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) +endif() + +# Check that the C 
and C++ compilers are from the same toolchain +if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) + message( + FATAL_ERROR + "CXX and C compiler providers differ. Please specify the same compiler toolchain" + ) +endif() + +set(COMP_ERR_MSG + "Compilation is only supported with GNU versions 7, 8, 9, 10, 11, 12, 13, 14. \ + If compilation fails please use one of the supported compilers." +) +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION + VERSION_GREATER 14.2) + message(WARNING ${COMP_ERR_MSG}) + endif() +else() + message(WARNING ${COMP_ERR_MSG}) +endif() diff --git a/src/UpperPHY/CRC/acle/crc_common.hpp b/src/UpperPHY/CRC/acle/crc_common.hpp deleted file mode 100644 index 47bf69e..0000000 --- a/src/UpperPHY/CRC/acle/crc_common.hpp +++ /dev/null @@ -1,271 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#pragma once - -#include - -static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { - // Sometimes compilers don't realize that they don't need an extra - // instruction to extract the 0th lane of a vector, e.g. when doing - // vmull_p64(a[0], b[0]), so this just gets around that. - poly128_t res; - asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { - // If vmull_high_p64 is used, then clang might use a mov to general - // purpose registers and back follow by a pmull. This forces the use - // of a single pmull2 instruction instead. - poly128_t res; - asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -template -static inline poly64x2_t load_p64x2(const poly64_t *p_in) { - poly64x2_t vec = vld1q_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -template -static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { - poly64x2_t vec = vld1q_dup_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { - // There are two reasons why we can't just use the vaddq_p64 intrinsic: - // 1. It isn't available on the earliest GCC version we currently support - // 2. If GCC recognizes that this is an associative operation, then it tries - // to optimize the operation tree in its tree-reassoc pass, but it - // actually makes the performance much worse. Hiding it in assembly means - // that the compiler uses our carefully balanced operation tree instead. - uint8x16_t res; - asm("eor %0.16b, %1.16b, %2.16b" - : "=w"(res) - : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); - return (poly64x2_t)res; -} - -/** - * Computes a CRC64 in big- or little-endian mode using the specified shifts - * and polynomials. This can be used for smaller polynomials by shifting - * them to a degree 64 polynomial. - * - * @tparam BarretShift the shift used when computing @c ls1_divp. 
- * @param[in] size number of bytes of the given buffer - * @param[in] input points to the input byte sequence - * @param[out] crc the computed CRC - * @param[in] constants the constants specific to each polynomial: - constants[0] = padding - constants[1] = (1<<128) / P_CRC - (1<<64) - constants[2:11] = [ (1<<(64*k)) mod P_CRC, - for k in [1,1,2,3,4,5,6,7,8,9] ] - */ -template -static inline __attribute__((always_inline)) void -crc64(uint32_t size, const uint64_t *input, uint64_t *crc, - const poly64_t constants[]) { - const poly64_t *p_in = (const poly64_t *)input; - - if (size == 8) { - // Special case for <=64 bits - poly64x2_t divp_p = vld1q_p64(&constants[1]); - - // This might compile to a separate ldr and dup, which is - // fine because the operation using the upper half depends - // on the output of the operation using the lower half. - poly64x2_t v11 = load_dup_p64(p_in); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); - vb = add_p64x2(vb, v11); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load constants for size = 16 - poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); - poly64x2_t ls11modp = vld1q_p64(&constants[2]); - poly64x2_t ls23modp = vld1q_p64(&constants[4]); - - if (size == 16) { - poly64x2_t v21 = load_p64x2(p_in); - poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load the rest of the constants - poly64x2_t ls45modp = vld1q_p64(&constants[6]); - poly64x2_t ls67modp = vld1q_p64(&constants[8]); - poly64x2_t ls89modp = vld1q_p64(&constants[10]); - - if (size == 32) { - poly64x2_t v43a = load_p64x2(p_in); - poly64x2_t v19 = load_p64x2(p_in + 2); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); - poly64x2_t v01 = add_p64x2(v01a, v01e); - v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit - uint32_t init_bytes = size % 64; - const poly64_t *p_end = p_in + (size - 16) / 8; - - // These values are carried forwards to the next loop iteration each time. 
- poly64x2_t v01; - - if (init_bytes == 16) { - v01 = vdupq_n_p64(0); - p_in += 8; - } else if (init_bytes == 32) { - poly64x2_t v43 = load_p64x2(p_in); - p_in += 10; - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01 = add_p64x2(v01a, v01e); - } else if (init_bytes == 48) { - poly64x2_t v65 = load_p64x2(p_in); - poly64x2_t v43 = load_p64x2(p_in + 2); - p_in += 12; - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - } else { - poly64x2_t v87 = load_p64x2(p_in); - poly64x2_t v65 = load_p64x2(p_in + 2); - poly64x2_t v43 = load_p64x2(p_in + 4); - p_in += 14; - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v19 = load_p64x2(p_in - 8); - - if (size <= 64) { - poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - poly64x2_t v87 = load_p64x2(p_in - 6); - poly64x2_t v65 = load_p64x2(p_in - 4); - poly64x2_t v43 = load_p64x2(p_in - 2); - - while (p_in < p_end) { - poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - v19 = load_p64x2(p_in); - v87 = load_p64x2(p_in + 2); - - poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01b = add_p64x2(v01b, v01bb); - - poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - - v65 = load_p64x2(p_in + 4); - v43 = load_p64x2(p_in + 6); - p_in += 8; - - v01a = add_p64x2(v01a, v01aa); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v21 = load_p64x2(p_in); - - poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01f = add_p64x2(v01f, v01ff); - v01e = add_p64x2(v01e, v01ee); - v01e = add_p64x2(v01e, v01f); - - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); -} diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index e65d27b..e97b466 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,11 +1,278 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ -#ifndef ARMRAL_ARCH_HWY -#include "acle/crc_common.hpp" -#else + +#pragma once + +#ifdef ARMRAL_ARCH_HWY #include "highway/crc_common.hpp" -#endif +#else + +#include + +static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { + // Sometimes compilers don't realize that they don't need an extra + // instruction to extract the 0th lane of a vector, e.g. when doing + // vmull_p64(a[0], b[0]), so this just gets around that. + poly128_t res; + asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { + // If vmull_high_p64 is used, then clang might use a mov to general + // purpose registers and back follow by a pmull. This forces the use + // of a single pmull2 instruction instead. + poly128_t res; + asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +template +static inline poly64x2_t load_p64x2(const poly64_t *p_in) { + poly64x2_t vec = vld1q_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +template +static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { + poly64x2_t vec = vld1q_dup_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { + // There are two reasons why we can't just use the vaddq_p64 intrinsic: + // 1. It isn't available on the earliest GCC version we currently support + // 2. If GCC recognizes that this is an associative operation, then it tries + // to optimize the operation tree in its tree-reassoc pass, but it + // actually makes the performance much worse. Hiding it in assembly means + // that the compiler uses our carefully balanced operation tree instead. + uint8x16_t res; + asm("eor %0.16b, %1.16b, %2.16b" + : "=w"(res) + : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); + return (poly64x2_t)res; +} + +/** + * Computes a CRC64 in big- or little-endian mode using the specified shifts + * and polynomials. This can be used for smaller polynomials by shifting + * them to a degree 64 polynomial. 
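+ *
+ * Every size path below finishes with a Barrett reduction: rather than
+ * dividing by P_CRC, the folded value is carry-less multiplied by the
+ * precomputed reciprocal constant (1<<128) / P_CRC - (1<<64) (constants[1])
+ * to estimate the quotient, and a second multiply folds that estimate back
+ * to give the 64-bit remainder, which is the CRC.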
+ * + * @tparam BarretShift the shift used when computing @c ls1_divp. + * @param[in] size number of bytes of the given buffer + * @param[in] input points to the input byte sequence + * @param[out] crc the computed CRC + * @param[in] constants the constants specific to each polynomial: + constants[0] = padding + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, + for k in [1,1,2,3,4,5,6,7,8,9] ] + */ +template +static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { + const poly64_t *p_in = (const poly64_t *)input; + + if (size == 8) { + // Special case for <=64 bits + poly64x2_t divp_p = vld1q_p64(&constants[1]); + + // This might compile to a separate ldr and dup, which is + // fine because the operation using the upper half depends + // on the output of the operation using the lower half. + poly64x2_t v11 = load_dup_p64(p_in); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); + vb = add_p64x2(vb, v11); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load constants for size = 16 + poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); + poly64x2_t ls11modp = vld1q_p64(&constants[2]); + poly64x2_t ls23modp = vld1q_p64(&constants[4]); + + if (size == 16) { + poly64x2_t v21 = load_p64x2(p_in); + poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load the rest of the constants + poly64x2_t ls45modp = vld1q_p64(&constants[6]); + poly64x2_t ls67modp = vld1q_p64(&constants[8]); + poly64x2_t ls89modp = vld1q_p64(&constants[10]); + + if (size == 32) { + poly64x2_t v43a = load_p64x2(p_in); + poly64x2_t v19 = load_p64x2(p_in + 2); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); + poly64x2_t v01 = add_p64x2(v01a, v01e); + v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit + uint32_t init_bytes = size % 64; + const poly64_t *p_end = p_in + (size - 16) / 8; + + // These values are carried forwards to the next loop iteration each time. 
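+  // The prologue below folds the leading size % 64 bytes first, so that the
+  // main while loop can consume a full 64-byte block (four 128-bit vectors)
+  // on every iteration.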
+ poly64x2_t v01; + + if (init_bytes == 16) { + v01 = vdupq_n_p64(0); + p_in += 8; + } else if (init_bytes == 32) { + poly64x2_t v43 = load_p64x2(p_in); + p_in += 10; + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01 = add_p64x2(v01a, v01e); + } else if (init_bytes == 48) { + poly64x2_t v65 = load_p64x2(p_in); + poly64x2_t v43 = load_p64x2(p_in + 2); + p_in += 12; + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + } else { + poly64x2_t v87 = load_p64x2(p_in); + poly64x2_t v65 = load_p64x2(p_in + 2); + poly64x2_t v43 = load_p64x2(p_in + 4); + p_in += 14; + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v19 = load_p64x2(p_in - 8); + + if (size <= 64) { + poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + poly64x2_t v87 = load_p64x2(p_in - 6); + poly64x2_t v65 = load_p64x2(p_in - 4); + poly64x2_t v43 = load_p64x2(p_in - 2); + + while (p_in < p_end) { + poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + v19 = load_p64x2(p_in); + v87 = load_p64x2(p_in + 2); + + poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01b = add_p64x2(v01b, v01bb); + + poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + + v65 = load_p64x2(p_in + 4); + v43 = load_p64x2(p_in + 6); + p_in += 8; + + v01a = add_p64x2(v01a, v01aa); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v21 = load_p64x2(p_in); + + poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01f = add_p64x2(v01f, v01ff); + v01e = add_p64x2(v01e, v01ee); + v01e = add_p64x2(v01e, v01f); + + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); +} + +#endif \ No newline at end of file -- GitLab From f9b23a15c8f7beead6705fac7cb9af25b0699158 Mon Sep 17 00:00:00 2001 From: Finlay Smyth Date: Mon, 13 Jan 2025 12:06:18 +0000 Subject: [PATCH 13/20] Highway implementation of Convolutional Encoder and Decoder --- CMakeLists.txt | 16 +- armral_hwy.cmake.in | 20 +- .../highway/arm_convolutional_decoder.cpp | 359 ++++++++++++++++++ .../highway/arm_convolutional_encoder.cpp | 154 ++++++++ src/utils/hwy_types.hpp | 1 + 5 files changed, 534 insertions(+), 16 deletions(-) create mode 100644 src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp create mode 100644 src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d38db8b..761c652 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -426,10 +426,10 @@ if(BUILD_TESTING) add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) add_armral_test(crc test/UpperPHY/CRC/main.cpp) - # add_armral_test(tail_biting_convolutional_decoding - # test/UpperPHY/ConvolutionalDecoder/main.cpp) - # add_armral_test(tail_biting_convolutional_encoding - # test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) @@ -636,10 +636,10 @@ if(BUILD_TESTING) add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - # add_armral_bench(tail_biting_convolutional_decoding - # bench/UpperPHY/ConvolutionalDecoder/main.cpp) - # add_armral_bench(tail_biting_convolutional_encoding - # bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in index c500d43..a61f82b 100644 
--- a/armral_hwy.cmake.in +++ b/armral_hwy.cmake.in @@ -47,16 +47,20 @@ set_property( ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp APPEND PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) -# The SQRDMULH instruction required by demodulation for fixed point -# multiplication is only available under NEON and SVE2 on aarch64. We have -# disabled SVE for all Arm platforms when SQRDMULH is required; to avoid falling -# back to (slower) generic implementations. Additionally disable SVE2 for all -# Arm platforms for demodulation as the SVE implementation of the -# OrderedDemote2To operation adds a ~40% overhead to demodulation. +# For Demodulation disable SVE/SVE2 due to: The VQRDMULH instruction required by +# demodulation for fixed point multiplication is only available under NEON and +# SVE2 on aarch64, therefore, we have disabled SVE for all Arm platforms when +# VQRDMULH is required; to avoid falling back to (slower) generic +# implementations. Additionally disable SVE2 for all Arm platforms for +# demodulation as the SVE implementation of the OrderedDemote2To operation adds +# a ~40% overhead to demodulation. For Convolutional Decoder disable SVE/SVE2 +# due to: the SVE implementation of the OrderedDemote2To operation adding a ~40% +# overhead set_property( SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp @@ -142,8 +146,8 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp diff --git a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp new file mode 100644 index 0000000..5749648 --- /dev/null +++ b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp @@ -0,0 +1,359 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#include "armral.h" +#include "utils/allocators.hpp" + +#include "utils/bits_to_bytes.hpp" + +#include "../convolutional_code_table.hpp" + +#include +#include + +namespace hn = hwy::HWY_NAMESPACE; +#include "utils/hwy_types.hpp" + + + +namespace { + +struct pm_s { + uint32_t pm; + uint8_t i; +}; + +void compute_path(uint8_t *dec, uint32_t k, uint8_t states, uint8_t const *prev, + uint8_t *i_ptr) { + // Final state index + uint8_t i = *i_ptr; + + // Compute path and decoded stream + for 
(uint32_t j = k; j > 0; j--) { + // For the states belonging to the first half ([0; 31]) the decoded bit is + // 0, for the other ones ([32; 63]) it is 1 + if (i < 32) { + dec[j - 1] = 0; + } else { + dec[j - 1] = 1; + } + + i = prev[(j - 1) * states + i]; + } + + // Initial state index + *i_ptr = i; +} + +int cmp(const void *a, const void *b) { + int ret; + const pm_s ia = *static_cast(a); + const pm_s ib = *static_cast(b); + + if (ia.pm < ib.pm) { + ret = -1; + } else { + ret = 1; + } + + return ret; +} + +template +armral_status tail_biting_convolutional_decode_block( + const int8_t *__restrict src0, const int8_t *__restrict src1, + const int8_t *__restrict src2, uint32_t k, uint32_t iter_max, uint8_t *dst, + Allocator &allocator) { + constexpr uint8_t states = 64; // 6 memory bits => 2^6 = 64 states + + auto initial_sm = allocate_zeroed(allocator, states); + auto intermediate_sm = allocate_zeroed(allocator, states); + auto final_sm = allocate_uninitialized(allocator, states); + auto initial_states_i = allocate_uninitialized(allocator, states); + auto pm_v = allocate_uninitialized(allocator, states); + // [states x K] matrix (row-major) + auto bytes_dst = allocate_uninitialized(allocator, states * k); + // [K x states] matrix + auto prev = allocate_zeroed(allocator, k * states); + + if constexpr (Allocator::is_counting) { + return ARMRAL_SUCCESS; + } + + uint8_t ro_best_i; + uint8_t ro_tb_best_i = states; // Initialized with impossible value + + uint8_t iter_cnt = 0; + uint32_t preva_init_data[] = {0, 2, 4, 6}; + uint32_t prevb_init_data[] = {1, 3, 5, 7}; + Vec_u32x4 preva_init = hn::LoadU(du32x4, preva_init_data); + Vec_u32x4 prevb_init = hn::LoadU(du32x4, prevb_init_data); + Vec_u32x4 all_8s = hn::Set(du32x4, 8); + Vec_i16x8 all_765s = hn::Set(di16x8, 765); + + // Start WAVA + do { + iter_cnt++; + + // == Compute branch and state metrics == + for (uint32_t i = 0; i < k; i++) { + // Given a state of the trellis (j), the two previous ones are always j << + // 1 (preva) and j << 1 + 1 (prevb). In the inner loop we iterate over all + // the states in order, from 0 to states - 1, hence the preva states will + // be 0, 2, 4... and the prevb states will be 1, 3, 5... In this + // implementation we consider 16 states at a time. + Vec_u32x4 preva = preva_init; + Vec_u32x4 prevb = prevb_init; + + Vec_i8x16 s0 = hn::Set(di8x16, src0[i]); + Vec_i8x16 s1 = hn::Set(di8x16, src1[i]); + Vec_i8x16 s2 = hn::Set(di8x16, src2[i]); + + Vec_i16x8 s0_low = hn::PromoteLowerTo(di16x8, s0); + Vec_i16x8 s0_high = hn::PromoteUpperTo(di16x8, s0); + Vec_i16x8 s1_low = hn::PromoteLowerTo(di16x8, s1); + Vec_i16x8 s1_high = hn::PromoteUpperTo(di16x8, s1); + Vec_i16x8 s2_low = hn::PromoteLowerTo(di16x8, s2); + Vec_i16x8 s2_high = hn::PromoteUpperTo(di16x8, s2); + + // In memory we stored the codewords for each state (table0 contains the + // possible codewords when x[k]=0, table1 when x[k] = 1, they correspond + // to the first and the second half of the trellis, since the first + // element in the state is x[k] of the previous stage). Only the codewords + // for the even states have been stored (the codewords of the odd states + // can be computed as the inverse of the codeword of the previous odd + // state). This is why this loop goes from 0 to states / 2. 
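+      // In scalar terms, the branch metric computed below for a candidate
+      // codeword (t0, t1, t2) is bma = |s0 - t0| + |s1 - t1| + |s2 - t2|,
+      // the L1 distance between the received soft bits and that codeword.
+      // Because the prevb branch uses the bitwise-inverse codeword of the
+      // preva branch, its metric follows as bmb = 765 - bma (765 = 3 * 255).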
+ for (uint8_t j = 0; j < states / 2; j += 16) { + + // Compute table0 branch metrics (bma and bmb) + + Vec_i8x16 t00 = hn::LoadU(di8x16, table0_0 + j); + Vec_i8x16 t01 = hn::LoadU(di8x16, table0_1 + j); + Vec_i8x16 t02 = hn::LoadU(di8x16, table0_2 + j); + // bma = abs(s0 - t0) + Vec_i16x8 bma0_lo = hn::Set(di16x8, 0); + Vec_i16x8 bma0_hi = hn::Set(di16x8, 0); + bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t00))); + bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t00))); + // bma += abs(s1 - t1) + bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t01))); + bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t01))); + // bma += abs(s2 - t2) + bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t02))); + bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t02))); + // The branch metric for the prevb state is computed as 3 - the branch + // metric of the preva state (Q format, so bmb = 765 - bma)11 + Vec_i16x8 bmb0_lo = hn::Sub(all_765s, bma0_lo); + Vec_i16x8 bmb0_hi = hn::Sub(all_765s, bma0_hi); + + // Compute table1 branch metrics (bma and bmb) + + Vec_i8x16 t10 =hn::LoadU(di8x16, table1_0 + j); + Vec_i8x16 t11 =hn::LoadU(di8x16, table1_1 + j); + Vec_i8x16 t12 =hn::LoadU(di8x16, table1_2 + j); + Vec_i16x8 bma1_lo = hn::Set(di16x8, 0); + Vec_i16x8 bma1_hi = hn::Set(di16x8, 0); + bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t10))); + bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t10))); + bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t11))); + bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t11))); + bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t12))); + bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t12))); + Vec_i16x8 bmb1_lo = hn::Sub(all_765s, bma1_lo); + Vec_i16x8 bmb1_hi = hn::Sub(all_765s, bma1_hi); + + // Compute table0 state metrics and previous states matrix + + // Possible previous states: + // prev_state_a = 0, 2, 4, ... + // prev_state_b = 1, 3, 5, ... 
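+        // The even/odd split below comes from LoadInterleaved2: it
+        // de-interleaves the eight metrics starting at &intermediate_sm[2 * j]
+        // so that int_sm_0 holds the even-indexed (preva) entries and
+        // int_sm_1 the odd-indexed (prevb) entries.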
+ // Load intermediate_sm[prev_state_a] and intermediate_sm[prev_state_b] + Vec_i32x4 int_sm_0 = hn::Undefined(di32x4); + Vec_i32x4 int_sm_1 = hn::Undefined(di32x4); + hn::LoadInterleaved2(di32x4, &intermediate_sm[2 * j], int_sm_0, int_sm_1); + + // intermediate_sm[prev_state_a] + bma + Vec_i32x4 int_bma0 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, bma0_lo)); + // intermediate_sm[prev_state_b] + bmb + Vec_i32x4 int_bmb0 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, bmb0_lo)); + // if (intermediate_sm[prev_state_a] + bm_a > + // intermediate_sm[prev_state_b] + bm_b) + // prev[j][i] = prev_state_a; + // final_sm[j] = intermediate_sm[prev_state_a] + bm_a; + Mask_u32x4 pred = hn::RebindMask(du32x4, hn::Le(int_bma0, int_bmb0)); + Vec_i32x4 finalsm = hn::Max(int_bmb0, int_bma0); + hn::StoreU(finalsm, di32x4, &final_sm[j]); + Vec_u32x4 prevab0ll = hn::IfThenElse(pred, prevb, preva); + + // Compute table1 state metrics and previous states matrix + + Vec_i32x4 int_bma1 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma1_lo))); + Vec_i32x4 int_bmb1 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb1_lo))); + pred = hn::RebindMask(du32x4, hn::Le(int_bma1, int_bmb1)); + finalsm = hn::Max(int_bmb1, int_bma1); + hn::StoreU(finalsm, di32x4, &final_sm[j + states / 2]); + Vec_u32x4 prevab1ll = hn::IfThenElse(pred, prevb, preva); + + // Update previous state vectors (the two vectors contain four + // consecutive even values and four consecutive odd values, hence we can + // add 8 to each element to obtain the next states in the sequence, e.g. + // [0, 2, 4, 6] -> [8, 10, 12, 14]) + preva = hn::Add(preva, all_8s); + prevb = hn::Add(prevb, all_8s); + + hn::LoadInterleaved2(di32x4, &intermediate_sm[8 + 2 * j], int_sm_0, int_sm_1); + + int_bma0 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma0_lo)); + int_bmb0 = hn::Add(int_sm_1, hn::PromoteUpperTo(di32x4, bmb0_lo)); + pred = hn::RebindMask(du32x4, hn::Le(int_bma0, int_bmb0)); + finalsm = hn::Max(int_bmb0, int_bma0); + hn::StoreU(finalsm, di32x4, &final_sm[4 + j]); + Vec_u32x4 prevab0lh = hn::IfThenElse(pred, prevb, preva); + + int_bma1 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma1_lo)); + int_bmb1 = hn::Add(int_sm_1, hn::PromoteUpperTo(di32x4, bmb1_lo)); + pred = hn::RebindMask(du32x4, hn::Le(int_bma1, int_bmb1)); + finalsm = hn::Max(int_bmb1, int_bma1); + hn::StoreU(finalsm, di32x4, &final_sm[4 + j + states / 2]); + Vec_u32x4 prevab1lh = hn::IfThenElse(pred, prevb, preva); + + preva = hn::Add(preva, all_8s); + prevb = hn::Add(prevb, all_8s); + + Vec_u16x8 prevab0l = hn::OrderedDemote2To(du16x8, prevab0ll, prevab0lh); + Vec_u16x8 prevab1l = hn::OrderedDemote2To(du16x8, prevab1ll, prevab1lh); + + hn::LoadInterleaved2(di32x4, &intermediate_sm[16 + 2 * j], int_sm_0, int_sm_1); + + int_bma0 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma0_hi))); + int_bmb0 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb0_hi))); + pred = hn::RebindMask(du32x4, hn::Le(int_bma0, int_bmb0)); + finalsm = hn::Max(int_bmb0, int_bma0); + hn::StoreU(finalsm, di32x4, &final_sm[8 + j]); + Vec_u32x4 prevab0hl = hn::IfThenElse(pred, prevb, preva); + + int_bma1 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma1_hi))); + int_bmb1 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb1_hi))); + pred = hn::RebindMask(du32x4, hn::Le(int_bma1, int_bmb1)); + finalsm = hn::Max(int_bmb1, int_bma1); + hn::StoreU(finalsm, di32x4, &final_sm[8 + j + states / 2]); + Vec_u32x4 prevab1hl = hn::IfThenElse(pred, prevb, preva); + + preva = hn::Add(preva, all_8s); + prevb = hn::Add(prevb, 
all_8s); + + hn::LoadInterleaved2(di32x4, &intermediate_sm[24 + 2 * j], int_sm_0, int_sm_1); + + int_bma0 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma0_hi)); + int_bmb0 = hn::Add(int_sm_1, hn::PromoteUpperTo(di32x4, bmb0_hi)); + pred = hn::RebindMask(du32x4, hn::Le(int_bma0, int_bmb0)); + finalsm = hn::Max(int_bmb0, int_bma0); + hn::StoreU(finalsm, di32x4, &final_sm[12 + j]); + Vec_u32x4 prevab0hh = hn::IfThenElse(pred, prevb, preva); + + int_bma1 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma1_hi)); + int_bmb1 = hn::Add(int_sm_1, hn::PromoteUpperTo(di32x4, bmb1_hi)); + pred = hn::RebindMask(du32x4, hn::Le(int_bma1, int_bmb1)); + finalsm = hn::Max(int_bmb1, int_bma1); + hn::StoreU(finalsm, di32x4, &final_sm[12 + j + states / 2]); + Vec_u32x4 prevab1hh = hn::IfThenElse(pred, prevb, preva); + + Vec_u16x8 prevab0h = hn::OrderedDemote2To(du16x8, prevab0hl, prevab0hh); + Vec_u16x8 prevab1h = hn::OrderedDemote2To(du16x8, prevab1hl, prevab1hh); + + // Finally store the previous values + Vec_u8x16 prevab0 = hn::OrderedDemote2To(du8x16, prevab0l, prevab0h); + hn::StoreU(prevab0, du8x16, &prev[i * states + j]); + Vec_u8x16 prevab1 = hn::OrderedDemote2To(du8x16, prevab1l, prevab1h); + hn::StoreU(prevab1, du8x16, &prev[i * states + j + states / 2]); + + preva = hn::Add(preva, all_8s); + prevb = hn::Add(prevb, all_8s); + } + + // Update previous state metrics array + memcpy(intermediate_sm.get(), final_sm.get(), states * sizeof(uint32_t)); + } + + // == Traceback == + // Compute paths (path metrics and decoded stream) and initial states (state + // metrics and indices) + for (uint8_t i = 0; i < states; i++) { + uint8_t state_i = i; + compute_path(&bytes_dst[i * k], k, states, (uint8_t const *)prev.get(), + &state_i); + pm_v[i].pm = final_sm[i] - initial_sm[state_i]; + initial_states_i[i] = state_i; + } + + // Sort path metrics array and keep track of indices + for (uint8_t i = 0; i < states; i++) { + pm_v[i].i = i; + } + qsort(pm_v.get(), states, sizeof(pm_s), cmp); + + // This is the best path (is it the codeword?) 
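+ // After the sort, pm_v[0] holds the path with the best metric; under WAVA it + // is the decoded codeword only if it is also tail-biting, which is checked + // below.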
+ ro_best_i = pm_v[0].i; + + // If the best path is also tailbiting (final state = initial state), it is + // the codeword, so exit from the while loop + if (initial_states_i[pm_v[0].i] == ro_best_i) { + ro_tb_best_i = ro_best_i; + break; + } + + // Codeword not found immediately, another iteration is needed + memcpy(initial_sm.get(), final_sm.get(), states * sizeof(uint32_t)); + memcpy(intermediate_sm.get(), final_sm.get(), states * sizeof(uint32_t)); + + // Look for best TB path + for (uint8_t i = 1; i < states; i++) { + if (initial_states_i[pm_v[i].i] == pm_v[i].i) { + ro_tb_best_i = pm_v[i].i; + break; // Found, exit from for loop (but stay in while loop for another + // WAVA iteration) + } + } + } while (iter_cnt < iter_max); + + // == Output decoded stream == + // Convert the bytes back to bits + if (ro_tb_best_i != states) { // if TB path found + armral::bytes_to_bits(k, &bytes_dst[ro_tb_best_i * k], dst); + } else { + armral::bytes_to_bits(k, &bytes_dst[ro_best_i * k], dst); + } + + return ARMRAL_SUCCESS; +} + +} // anonymous namespace + +armral_status armral_tail_biting_convolutional_decode_block( + const int8_t *__restrict src0, const int8_t *__restrict src1, + const int8_t *__restrict src2, uint32_t k, uint32_t iter_max, + uint8_t *dst) { + heap_allocator allocator{}; + return tail_biting_convolutional_decode_block(src0, src1, src2, k, iter_max, + dst, allocator); +} + +armral_status armral_tail_biting_convolutional_decode_block_noalloc( + const int8_t *__restrict src0, const int8_t *__restrict src1, + const int8_t *__restrict src2, uint32_t k, uint32_t iter_max, uint8_t *dst, + void *buffer) { + buffer_bump_allocator allocator{buffer}; + return tail_biting_convolutional_decode_block(src0, src1, src2, k, iter_max, + dst, allocator); +} + +uint32_t armral_tail_biting_convolutional_decode_block_noalloc_buffer_size( + uint32_t k, uint32_t iter_max) { + counting_allocator allocator{}; + tail_biting_convolutional_decode_block(nullptr, nullptr, nullptr, k, iter_max, + nullptr, allocator); + return allocator.required_bytes(); +} diff --git a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp new file mode 100644 index 0000000..e029f81 --- /dev/null +++ b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp @@ -0,0 +1,154 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#include "armral.h" + +#include +#include + +#include +#include "utils/hwy_types.hpp" +namespace hn = hwy::HWY_NAMESPACE; + + + +armral_status armral_tail_biting_convolutional_encode_block(const uint8_t *src, + uint32_t k, + uint8_t *dst0, + uint8_t *dst1, + uint8_t *dst2) { + + // y0[n] = x[n] + s[1] + s[2] + s[4] + s[5] + // y1[n] = x[n] + s[0] + s[1] + s[2] + s[5] + // y2[n] = x[n] + s[0] + s[1] + s[3] + s[5] + // The shift register (s) is initialized with the last six + // information bits of the input stream (s[i] = x[k-1-i]) + + uint64_t pmask0 = 0b01011011; + uint64_t pmask1 = 0b01111001; + uint64_t pmask2 = 0b01110101; + + uint32_t i = 0; + + // Iterate the main loop floor((k-8)/56) times + for (; i < (k - 8) / 56; i++) { + // Load input samples reversed in polynomial + Mask_u8 pg = hn::FirstN(du8, 8); + Vec_u8 xv = no_sanitize::MaskedLoad(pg, du8, src + i * 7); + xv = hn::Reverse8(du8, xv); + uint64_t x = 
hn::GetLane(hn::BitCast(du64, xv)); + + // Compute outputs (skip the first byte) + Vec_u64 y0_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); + Vec_u64 y1_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); + Vec_u64 y2_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); + + // Delete the 6 bits tail + y0_temp = hn::ShiftRight<6>(y0_temp); + y1_temp = hn::ShiftRight<6>(y1_temp); + y2_temp = hn::ShiftRight<6>(y2_temp); + + // Reverse again + Vec_u8 y0 = hn::BitCast(du8, y0_temp); + Vec_u8 y1 = hn::BitCast(du8, y1_temp); + Vec_u8 y2 = hn::BitCast(du8, y2_temp); + y0 = hn::Reverse8(du8, y0); + y1 = hn::Reverse8(du8, y1); + y2 = hn::Reverse8(du8, y2); + + // Store 7 bytes in the output arrays + y0 = hn::ShiftRightLanes<1>(du8, y0); + y1 = hn::ShiftRightLanes<1>(du8, y1); + y2 = hn::ShiftRightLanes<1>(du8, y2); + + hn::StoreN(y0, du8, dst0 + 1 + i * 7, 8 - 1); + hn::StoreN(y1, du8, dst1 + 1 + i * 7, 8 - 1); + hn::StoreN(y2, du8, dst2 + 1 + i * 7, 8 - 1); + } + + int rem = k - 56 * i; // Remaining bits in the input + + // Tail + if ((k - 8) % 56 != 0) { + // Load input samples reversed in polynomial + Mask_u8 pg_tail = hn::FirstN(du8, (rem + 7) / 8); + Vec_u8 xv_tail = no_sanitize::MaskedLoad(pg_tail, du8, src + i * 7); + xv_tail = hn::Reverse8(du8, xv_tail); + uint64_t x = hn::GetLane(hn::BitCast(du64, xv_tail)); + + // Compute outputs (skip the first byte) + Vec_u64 y0_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); + Vec_u64 y1_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); + Vec_u64 y2_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); + + // Delete zeros (64 - rem) and tail (6) + y0_temp = hn::ShiftRightSame(y0_temp, (64 - rem + 6)); + y1_temp = hn::ShiftRightSame(y1_temp, (64 - rem + 6)); + y2_temp = hn::ShiftRightSame(y2_temp, (64 - rem + 6)); + + // zero padding on the right + y0_temp = hn::ShiftLeftSame(y0_temp, 7 - ((k - 1) % 8)); + y1_temp = hn::ShiftLeftSame(y1_temp, 7 - ((k - 1) % 8)); + y2_temp = hn::ShiftLeftSame(y2_temp, 7 - ((k - 1) % 8)); + + // Reverse again + Vec_u8 y0 = hn::BitCast(du8, y0_temp); + Vec_u8 y1 = hn::BitCast(du8, y1_temp); + Vec_u8 y2 = hn::BitCast(du8, y2_temp); + + y0 = hn::Reverse8(du8, y0); + y1 = hn::Reverse8(du8, y1); + y2 = hn::Reverse8(du8, y2); + + y0 = hn::SlideDownLanes(du8, y0, (8 - (rem + 7) / 8 + 1)); + y1 = hn::SlideDownLanes(du8, y1, (8 - (rem + 7) / 8 + 1)); + y2 = hn::SlideDownLanes(du8, y2, (8 - (rem + 7) / 8 + 1)); + + hn::StoreN(y0, du8, dst0 + 1 + i * 7, (rem + 7) / 8 - 1); + hn::StoreN(y1, du8, dst1 + 1 + i * 7, (rem + 7) / 8 - 1); + hn::StoreN(y2, du8, dst2 + 1 + i * 7, (rem + 7) / 8 - 1); + } + + // Tail biting part (first 6 bits, i.e. the first byte) + uint8_t xpr0 = src[0]; // x[0] ... x[7] + uint8_t x_end; // x[k-8] ... x[k-1] + if (k % 8 != 0) { + uint8_t x_end1 = src[(k + 7) / 8 - 1]; + uint8_t x_end2 = src[(k + 7) / 8 - 2]; + x_end = (x_end2 << k % 8) | (x_end1 >> (8 - k % 8)); + } else { + x_end = src[(k + 7) / 8 - 1]; + } + + // When the first 6 output samples are computed, in the shift register + // there are elements of the input tail (x[k-1] ... x[k-6]). For example, + // y0[0] = x[0] + s[1] + s[2] + s[4] + s[5] = + // = x[0] + x[k-2] + x[k-3] + x[k-5] + x[k-6]. 
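+ // Similarly, y1[0] = x[0] + x[k-1] + x[k-2] + x[k-3] + x[k-6] and + // y2[0] = x[0] + x[k-1] + x[k-2] + x[k-4] + x[k-6].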
+ // We can compute these output samples in parallel, loading and rearranging + // the right elements in uint8x8_t types (we ignore the last 2 elements) + + uint8_t xpr1 = (xpr0 >> 1) | (x_end << 7); // x[k-1] x[0] ... x[6] + uint8_t xpr2 = (xpr0 >> 2) | (x_end << 6); // x[k-2] x[k-1] x[0] ... x[5] + uint8_t xpr3 = (xpr0 >> 3) | (x_end << 5); // x[k-3] ... x[4] + uint8_t xpr4 = (xpr0 >> 4) | (x_end << 4); // x[k-4] ... x[3] + uint8_t xpr5 = (xpr0 >> 5) | (x_end << 3); // x[k-5] ... x[2] + uint8_t xpr6 = (xpr0 >> 6) | (x_end << 2); // x[k-6] ... x[1] + + // Compute the partial sums + uint8_t xpr06 = xpr0 ^ xpr6; // x[0] + x[k-6] + uint8_t xpr23 = xpr2 ^ xpr3; // x[k-2] + x[k-3] + uint8_t xpr12 = xpr1 ^ xpr2; // x[k-1] + x[k-2] + + uint8_t xpr0236 = xpr06 ^ xpr23; // x[0] + x[k-2] + x[k-3] + x[k-6] + uint8_t xpr0126 = xpr06 ^ xpr12; // x[0] + x[k-1] + x[k-2] + x[k-6] + + // Compute the final results + dst0[0] = xpr0236 ^ xpr5; // x[0] + x[k-2] + x[k-3] + x[k-5] + x[k-6] + dst1[0] = xpr0236 ^ xpr1; // x[0] + x[k-1] + x[k-2] + x[k-3] + x[k-6] + dst2[0] = xpr0126 ^ xpr4; // x[0] + x[k-1] + x[k-2] + x[k-4] + x[k-6] + + return ARMRAL_SUCCESS; +} diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 4f976d4..c7ffccd 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -47,6 +47,7 @@ using Vec_i64x2 = hn::Vec; // Mask Types using Mask_u8x16 = hn::Mask; +using Mask_u32x4 = hn::Mask; // Rebind Tags /* e.g. const hn::Rebind di16x8_di8x16; -- GitLab From f12a93f8413f49edc7666b8b384da36e595b5208 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Thu, 16 Jan 2025 14:32:58 +0000 Subject: [PATCH 14/20] Apply second round of review requests. Replaced CC Copyright Notices with CREDITS.md file entries. --- CMakeLists.txt | 2 +- CREDITS.md | 11 +++++++++++ armral_hwy.cmake.in => armral_hwy.cmake | 0 src/UpperPHY/CRC/highway/crc_common.hpp | 4 +--- src/utils/hwy_types.hpp | 5 ++--- 5 files changed, 15 insertions(+), 7 deletions(-) rename armral_hwy.cmake.in => armral_hwy.cmake (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a01db46..448c44b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,7 +269,7 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) endif() if(ARMRAL_ARCH STREQUAL "HWY") - include(armral_hwy.cmake.in) + include(armral_hwy.cmake) return() endif() diff --git a/CREDITS.md b/CREDITS.md index 0271d77..4b2b3b9 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -33,3 +33,14 @@ Acceleration Library: to support soft buffer sizes was contributed upstream by 4g5g Consultants. See . + +- Addition of a Google Highway as a fourth architecture `-DARMRAL_ARCH=HWY`. + Enabling future development using Google Highway platform agnostic + intrinsic implementations was contributed upstream by Cambridge + Consultants. See + . + +- Addition of the Google Highway crc implementation in + `src/UpperPHY/CRC/highway/crc_common.hpp` was contributed by + Cambridge Consultants. See + . 
\ No newline at end of file diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake similarity index 100% rename from armral_hwy.cmake.in rename to armral_hwy.cmake diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 5877bfa..11c2741 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -1,13 +1,11 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #pragma once -#include #include "utils/hwy_types.hpp" +#include namespace hn = hwy::HWY_NAMESPACE; diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 91f4bf1..ddbe35d 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -33,7 +33,7 @@ const hn::Full128 di32x4; const hn::Full128 du64x2; const hn::Full128 di64x2; -// Vector Types +// Vector Types using Vec_u8x16 = hn::Vec; using Vec_i8x16 = hn::Vec; using Vec_u16x8 = hn::Vec; @@ -50,7 +50,6 @@ which the rebind tag is created from and the second is the new tag type. These are used in operations where output vector width is different from that of the input. */ - /* Scalable vector types. The default choice should be to use these vector types since it allows for processing of more @@ -72,7 +71,7 @@ const hn::ScalableTag di32; const hn::ScalableTag du64; const hn::ScalableTag di64; -// Vector Types +// Vector Types using Vec_u8 = hn::Vec; using Vec_i8 = hn::Vec; using Vec_u16 = hn::Vec; -- GitLab From 98f368de2f6e58896647cadd82f2119c8e4d05dc Mon Sep 17 00:00:00 2001 From: William van den Aardweg Date: Fri, 17 Jan 2025 08:46:13 +0000 Subject: [PATCH 15/20] Reorder CREDITS.md file as requested --- CREDITS.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/CREDITS.md b/CREDITS.md index 4b2b3b9..e883c27 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -2,6 +2,17 @@ In addition to the primary development being done by Arm, the following people and organizations have contributed to Arm RAN Acceleration Library: +- Addition of the Google Highway crc implementation in + `src/UpperPHY/CRC/highway/crc_common.hpp` was contributed by + Cambridge Consultants. See + . + +- Addition of a Google Highway as a fourth architecture `-DARMRAL_ARCH=HWY`. + Enabling future development using Google Highway platform agnostic + intrinsic implementations was contributed upstream by Cambridge + Consultants. See + . + - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See @@ -32,15 +43,4 @@ Acceleration Library: - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support soft buffer sizes was contributed upstream by 4g5g Consultants. See - . - -- Addition of a Google Highway as a fourth architecture `-DARMRAL_ARCH=HWY`. - Enabling future development using Google Highway platform agnostic - intrinsic implementations was contributed upstream by Cambridge - Consultants. See - . - -- Addition of the Google Highway crc implementation in - `src/UpperPHY/CRC/highway/crc_common.hpp` was contributed by - Cambridge Consultants. See - . \ No newline at end of file + . 
\ No newline at end of file -- GitLab From 6447e5bcd4983a4310f6654f57813c48cfb12c09 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Mon, 20 Jan 2025 10:21:16 +0000 Subject: [PATCH 16/20] Apply changes highlighted in Arm CRC review, including: * Remove Copyright Notices and replace them with CREDITS.md file entries * Rename.cmake.in to .cmake * Run cmake-format on code * Run clang-format on code * Remove armral_acle and ensure that minimum changes are made to CMakeLists.txt * Do not use seperate acle header files rather use and ifdef to ensure minimal changes to `arm` code. --- CMakeLists.txt | 1090 +++++++---------- CREDITS.md | 15 +- armral_acle.cmake.in | 136 -- armral_hwy.cmake | 721 +++++++++++ armral_hwy.cmake.in | 176 --- .../Scrambling/highway/arm_scrambling.cpp | 10 +- .../highway/arm_mat_seq_generator.cpp | 34 +- .../highway/arm_convolutional_decoder.cpp | 70 +- .../highway/arm_convolutional_encoder.cpp | 24 +- .../Demodulation/highway/arm_demodulation.cpp | 22 +- src/UpperPHY/LDPC/highway/ldpc_decoder.cpp | 2 - src/UpperPHY/LDPC/highway/ldpc_encoder.cpp | 2 - .../Modulation/highway/arm_modulation.cpp | 83 +- src/utils/acle/bits_to_bytes.hpp | 128 -- src/utils/bits_to_bytes.hpp | 130 +- src/utils/highway/bits_to_bytes.hpp | 27 +- src/utils/hwy_types.hpp | 2 - 17 files changed, 1464 insertions(+), 1208 deletions(-) delete mode 100644 armral_acle.cmake.in create mode 100644 armral_hwy.cmake delete mode 100644 armral_hwy.cmake.in delete mode 100644 src/utils/acle/bits_to_bytes.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 761c652..448c44b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,94 @@ set(ARMRAL_ARCH NEON CACHE STRING "The architecture to build for ('NEON', 'SVE', 'SVE2' or 'HWY')") -set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE" "SVE2" "HWY") +set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2") + +set(ARMRAL_LIB_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) # Per source file compiler flag overrides/additions if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) @@ -181,6 +268,59 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) set(JOB_POOL_CONSOLE JOB_POOL console) endif() +if(ARMRAL_ARCH STREQUAL "HWY") + include(armral_hwy.cmake) + return() +endif() + +if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # If the optimization flags are already set, don't try and guess what they + # should be. + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.5-a+sve2+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.2-a+sve+crypto+fp16" + CACHE INTERNAL "") + elseif(ARMRAL_ARCH STREQUAL "NEON") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8-a+crypto" + CACHE INTERNAL "") + else() + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + # We explicitly set the optimization flags, so just copy those. 
We still need + # to set the appropriate SVE version definition + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +else() + set(ARMRAL_ARCH_COMPILE_OPTIONS "") + if(ARMRAL_ARCH STREQUAL "SVE2") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") + elseif(ARMRAL_ARCH STREQUAL "SVE") + set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") + elseif(NOT ARMRAL_ARCH STREQUAL "NEON") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + endif() +endif() + if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} @@ -210,23 +350,14 @@ else() set(ARMRAL_LINKER_FLAGS "") endif() -add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) -# The armral library is defined within the include files -# -if(ARMRAL_ARCH STREQUAL "HWY") - # The armral_utils library will have additional link libraries added within - # this include - include(armral_hwy.cmake.in) -else() - include(armral_acle.cmake.in) -endif() - +add_library(armral ${ARMRAL_LIB_SOURCES}) target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} @@ -351,635 +482,300 @@ if(BUILD_TESTING) DEPENDS bench_${BENCH_NAME}) endfunction() - # Temporary duplication while porting is in progress to maintain the order of - # bench_excel_summary output - if(ARMRAL_ARCH STREQUAL "HWY") - # cmake-format: off - # add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) - # add_armral_test(matrix_inv_single - # test/BasicMathFun/MatrixInv/Single/main.cpp) - # add_armral_test(arm_solve - # test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) - # add_armral_test( - # matrix_vector_mult_batch_16 - # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) - # add_armral_test( - # matrix_vector_mult_batch_32 - # test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) - # add_armral_test(matrix_mult_16 - # test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) - # add_armral_test(matrix_mult_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) - # add_armral_test( - # matrix_mult_aah_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - # add_armral_test( - # matrix_mult_ahb_32 - # test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - # add_armral_test( - # matrix_vector_mult_single_16 - # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) - # add_armral_test( - # matrix_vector_mult_single_32 - # test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - # add_armral_test(matrix_pseudo_inv_direct - # test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - # add_armral_test(vec_dot_16 - # test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - # add_armral_test(vec_dot_16_2 - # test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - # 
add_armral_test(vec_dot_16_2_32_bit - # test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - # add_armral_test(vec_dot_16_32_bit - # test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - # add_armral_test(vec_dot_32 - # test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - # add_armral_test(vec_dot_32_2 - # test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - # add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) - # add_armral_test(vec_mul_16_2 - # test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - # add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) - # add_armral_test(vec_mul_32_2 - # test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - # add_armral_test(mu_law_compression - # test/DuRuInterface/MuLaw/Compression/main.cpp) - # add_armral_test(mu_law_decompression - # test/DuRuInterface/MuLaw/Decompression/main.cpp) - # add_armral_test(block_float_compression - # test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) - # add_armral_test(block_float_decompression - # test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) - # add_armral_test(block_scaling_compression - # test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) - # add_armral_test(block_scaling_decompression - # test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - # add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) - # add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) - # add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) - # add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) - # add_armral_test(arm_fir_filter_cs16_decimate_2 - # test/LowerPHY/FIR/FIR16Decimate2/main.cpp) - # add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) - # add_armral_test(arm_fir_filter_cf32_decimate_2 - # test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - add_armral_test(crc test/UpperPHY/CRC/main.cpp) - add_armral_test(tail_biting_convolutional_decoding - test/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_test(tail_biting_convolutional_encoding - test/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) - # add_armral_test(polar_crc_attachment - # test/UpperPHY/Polar/CrcAttachment/main.cpp) - # add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) - # add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) - # add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) - # add_armral_test(polar_rate_matching - # test/UpperPHY/Polar/RateMatching/main.cpp) - # add_armral_test(polar_rate_recovery - # test/UpperPHY/Polar/RateRecovery/main.cpp) - # add_armral_test(polar_subchannel_deinterleave - # test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - # add_armral_test(polar_subchannel_interleave - # test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - # add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - # add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) - # add_armral_test(turbo_perm_indices 
test/UpperPHY/Turbo/PermIndices/main.cpp) - # add_armral_test(turbo_rate_matching - # test/UpperPHY/Turbo/RateMatching/main.cpp) - # add_armral_test(turbo_rate_recovery - # test/UpperPHY/Turbo/RateRecovery/main.cpp) - # add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) - - # add_armral_bench( - # matrix_inv_batch_general - # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_general_pa - # bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_hermitian - # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - # add_armral_bench( - # matrix_inv_batch_hermitian_pa - # bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - # add_armral_bench(matrix_inv_single_general - # bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) - # add_armral_bench( - # matrix_inv_single_hermitian - # bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) - # add_armral_bench(arm_solve_1x2 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - # add_armral_bench(arm_solve_1x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - # add_armral_bench(arm_solve_2x2 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - # add_armral_bench(arm_solve_2x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - # add_armral_bench(arm_solve_4x4 - # bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_32b - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_32b_pa - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_64b - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_i16_64b_pa - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_f32 - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - # add_armral_bench( - # matrix_vector_mult_batch_f32_pa - # bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - # add_armral_bench( - # matrix_mult_i16_32b - # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) - # add_armral_bench( - # matrix_mult_i16_64b - # bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) - # add_armral_bench( - # matrix_mult_f32_2x2_iq - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_2x2 - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_4x4_iq - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - # add_armral_bench( - # matrix_mult_f32_4x4 - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - # add_armral_bench( - # matmul_f32_general - # bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) - # add_armral_bench( - # matrix_mult_aah_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - # add_armral_bench( - # matrix_mult_ahb_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - # add_armral_bench( - # matrix_vector_mult_i16_32b - # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - # add_armral_bench( - # matrix_vector_mult_i16_64b - # 
bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - # add_armral_bench( - # matrix_vector_mult_32 - # bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - # add_armral_bench(matrix_pseudo_inv_direct - # bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - # add_armral_bench(vec_dot_16 - # bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - # add_armral_bench(vec_dot_16_2 - # bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - # add_armral_bench(vec_dot_16_2_32_bit - # bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - # add_armral_bench(vec_dot_16_32_bit - # bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - # add_armral_bench(vec_dot_32 - # bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - # add_armral_bench(vec_dot_32_2 - # bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - # add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) - # add_armral_bench(vec_mul_16_2 - # bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - # add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) - # add_armral_bench(vec_mul_32_2 - # bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - # add_armral_bench(mu_law_compression_14bit - # bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) - # add_armral_bench(mu_law_compression_8bit - # bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) - # add_armral_bench(mu_law_compression_9bit - # bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) - # add_armral_bench(mu_law_decompression_14bit - # bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) - # add_armral_bench(mu_law_decompression_8bit - # bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) - # add_armral_bench(mu_law_decompression_9bit - # bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) - # add_armral_bench( - # block_float_compression_12bit - # bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) - # add_armral_bench( - # block_float_compression_14bit - # bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) - # add_armral_bench( - # block_float_compression_8bit - # bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) - # add_armral_bench( - # block_float_compression_9bit - # bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) - # add_armral_bench( - # block_float_decompression_12bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) - # add_armral_bench( - # block_float_decompression_14bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) - # add_armral_bench( - # block_float_decompression_8bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) - # add_armral_bench( - # block_float_decompression_9bit - # bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_14bit - # bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_8bit - # bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) - # add_armral_bench( - # block_scaling_compression_9bit - # bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_14bit - # bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_8bit - # bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) - # add_armral_bench( - # block_scaling_decompression_9bit - # 
bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) - # add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) - # add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) - # add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) - # add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) - # add_armral_bench(arm_fir_filter_cs16_decimate_2 - # bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) - # add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) - # add_armral_bench(arm_fir_filter_cf32_decimate_2 - # bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding - bench/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding - bench/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching - bench/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery - bench/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) - # add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) - # add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) - # add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) - # add_armral_bench(polar_rate_matching - # bench/UpperPHY/Polar/RateMatching/main.cpp) - # add_armral_bench(polar_rate_recovery - # bench/UpperPHY/Polar/RateRecovery/main.cpp) - # add_armral_bench(polar_subchannel_deinterleave - # bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - # add_armral_bench(polar_subchannel_interleave - # bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - # add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) - # add_armral_bench(turbo_rate_matching - # bench/UpperPHY/Turbo/RateMatching/main.cpp) - # add_armral_bench(turbo_rate_recovery - # bench/UpperPHY/Turbo/RateRecovery/main.cpp) - # add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) - # cmake-format: on - else() - add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) - add_armral_test(matrix_inv_single - test/BasicMathFun/MatrixInv/Single/main.cpp) - add_armral_test(arm_solve - 
test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) - add_armral_test( - matrix_vector_mult_batch_16 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_batch_32 - test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_mult_16 - test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) - add_armral_test(matrix_mult_32 - test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) - add_armral_test( - matrix_mult_aah_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_test( - matrix_mult_ahb_32 - test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_test( - matrix_vector_mult_single_16 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) - add_armral_test( - matrix_vector_mult_single_32 - test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_pseudo_inv_direct - test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_test(vec_dot_16 - test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_test(vec_dot_16_2 - test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_test(vec_dot_16_2_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_test(vec_dot_16_32_bit - test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_test(vec_dot_32 - test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_test(vec_dot_32_2 - test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_test(vec_mul_16_2 - test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) - add_armral_test(vec_mul_32_2 - test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_test(mu_law_compression - test/DuRuInterface/MuLaw/Compression/main.cpp) - add_armral_test(mu_law_decompression - test/DuRuInterface/MuLaw/Decompression/main.cpp) - add_armral_test(block_float_compression - test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) - add_armral_test(block_float_decompression - test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) - add_armral_test(block_scaling_compression - test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) - add_armral_test(block_scaling_decompression - test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) - add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) - add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) - add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) - add_armral_test(arm_fir_filter_cs16_decimate_2 - test/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) - add_armral_test(arm_fir_filter_cf32_decimate_2 - test/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) - add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) - add_armral_test(crc test/UpperPHY/CRC/main.cpp) - add_armral_test(tail_biting_convolutional_decoding - test/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_test(tail_biting_convolutional_encoding - test/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) - add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_test(ldpc_encoding 
test/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) - add_armral_test(polar_crc_attachment - test/UpperPHY/Polar/CrcAttachment/main.cpp) - add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) - add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) - add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) - add_armral_test(polar_rate_matching - test/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_test(polar_rate_recovery - test/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_test(polar_subchannel_deinterleave - test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - add_armral_test(polar_subchannel_interleave - test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) - add_armral_test(turbo_rate_matching - test/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_test(turbo_rate_recovery - test/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) - - add_armral_bench( - matrix_inv_batch_general - bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - add_armral_bench( - matrix_inv_batch_general_pa - bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - add_armral_bench( - matrix_inv_batch_hermitian_pa - bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - add_armral_bench(matrix_inv_single_general - bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) - add_armral_bench( - matrix_inv_single_hermitian - bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) - add_armral_bench(arm_solve_1x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - add_armral_bench(arm_solve_1x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - add_armral_bench(arm_solve_2x2 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - add_armral_bench(arm_solve_2x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - add_armral_bench(arm_solve_4x4 - bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_32b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_i16_64b_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32 - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - add_armral_bench( - matrix_vector_mult_batch_f32_pa - bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - add_armral_bench( - matrix_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) - add_armral_bench( - matrix_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) - add_armral_bench( - 
matrix_mult_f32_2x2_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_2x2 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4_iq - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - add_armral_bench( - matrix_mult_f32_4x4 - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - add_armral_bench( - matmul_f32_general - bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) - add_armral_bench( - matrix_mult_aah_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_bench( - matrix_mult_ahb_32 - bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_32b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - add_armral_bench( - matrix_vector_mult_i16_64b - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - add_armral_bench( - matrix_vector_mult_32 - bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_bench(matrix_pseudo_inv_direct - bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) - add_armral_bench(vec_dot_16 - bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) - add_armral_bench(vec_dot_16_2 - bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) - add_armral_bench(vec_dot_16_2_32_bit - bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_bench(vec_dot_16_32_bit - bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_bench(vec_dot_32 - bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) - add_armral_bench(vec_dot_32_2 - bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) - add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) - add_armral_bench(vec_mul_16_2 - bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) - add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) - add_armral_bench(vec_mul_32_2 - bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) - add_armral_bench(mu_law_compression_14bit - bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) - add_armral_bench(mu_law_compression_8bit - bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) - add_armral_bench(mu_law_compression_9bit - bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) - add_armral_bench(mu_law_decompression_14bit - bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) - add_armral_bench(mu_law_decompression_8bit - bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) - add_armral_bench(mu_law_decompression_9bit - bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) - add_armral_bench( - block_float_compression_12bit - bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) - add_armral_bench( - block_float_compression_14bit - bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) - add_armral_bench( - block_float_compression_8bit - bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) - add_armral_bench( - block_float_compression_9bit - bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) - add_armral_bench( - block_float_decompression_12bit - bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) - add_armral_bench( - block_float_decompression_14bit - bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) - add_armral_bench( - block_float_decompression_8bit - bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) - add_armral_bench( - 
block_float_decompression_9bit - bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) - add_armral_bench( - block_scaling_compression_14bit - bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) - add_armral_bench( - block_scaling_compression_8bit - bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) - add_armral_bench( - block_scaling_compression_9bit - bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) - add_armral_bench( - block_scaling_decompression_14bit - bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) - add_armral_bench( - block_scaling_decompression_8bit - bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) - add_armral_bench( - block_scaling_decompression_9bit - bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) - add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) - add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) - add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) - add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) - add_armral_bench(arm_fir_filter_cs16_decimate_2 - bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) - add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) - add_armral_bench(arm_fir_filter_cf32_decimate_2 - bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) - add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) - add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding - bench/UpperPHY/ConvolutionalDecoder/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding - bench/UpperPHY/ConvolutionalEncoder/main.cpp) - add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) - add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching - bench/UpperPHY/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery - bench/UpperPHY/LDPC/RateRecovery/main.cpp) - add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) - add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) - add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) - add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) - add_armral_bench(polar_rate_matching - bench/UpperPHY/Polar/RateMatching/main.cpp) - add_armral_bench(polar_rate_recovery - bench/UpperPHY/Polar/RateRecovery/main.cpp) - add_armral_bench(polar_subchannel_deinterleave - bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) - 
add_armral_bench(polar_subchannel_interleave - bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) - add_armral_bench(turbo_rate_matching - bench/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_bench(turbo_rate_recovery - bench/UpperPHY/Turbo/RateRecovery/main.cpp) - add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) - endif() + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) + add_armral_test(arm_solve + test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + add_armral_test( + matrix_vector_mult_batch_16 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_batch_32 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_mult_16 + test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + add_armral_test(matrix_mult_32 + test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + add_armral_test(matrix_mult_aah_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_test(matrix_mult_ahb_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_test( + matrix_vector_mult_single_16 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_single_32 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_pseudo_inv_direct + test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_test(vec_dot_16_2 + test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_test(vec_dot_16_2_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_test(vec_dot_16_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_test(vec_dot_32_2 + test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_test(mu_law_compression + test/DuRuInterface/MuLaw/Compression/main.cpp) + add_armral_test(mu_law_decompression + test/DuRuInterface/MuLaw/Decompression/main.cpp) + add_armral_test(block_float_compression + test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + add_armral_test(block_float_decompression + test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + add_armral_test(block_scaling_compression + test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + add_armral_test(block_scaling_decompression + test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + add_armral_test(arm_fir_filter_cs16_decimate_2 + test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + 
add_armral_test(arm_fir_filter_cf32_decimate_2 + test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(polar_crc_attachment + test/UpperPHY/Polar/CrcAttachment/main.cpp) + add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) + add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) + add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_test(polar_subchannel_deinterleave + test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_test(polar_subchannel_interleave + test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) + add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + + add_armral_bench( + matrix_inv_batch_general + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + add_armral_bench(matrix_inv_batch_general_pa + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian_pa + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + add_armral_bench(matrix_inv_single_general + bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + add_armral_bench(matrix_inv_single_hermitian + bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + add_armral_bench(arm_solve_1x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + add_armral_bench(arm_solve_1x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + add_armral_bench(arm_solve_2x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + add_armral_bench(arm_solve_2x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + add_armral_bench(arm_solve_4x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b + 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32 + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + add_armral_bench( + matrix_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + add_armral_bench( + matrix_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + add_armral_bench( + matmul_f32_general + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + add_armral_bench( + matrix_mult_aah_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_bench( + matrix_mult_ahb_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + add_armral_bench( + matrix_vector_mult_32 + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_bench(matrix_pseudo_inv_direct + bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_bench(vec_dot_16 + bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_bench(vec_dot_16_2 + bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_bench(vec_dot_16_2_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_bench(vec_dot_16_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_bench(vec_dot_32 + bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_bench(vec_dot_32_2 + bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_bench(vec_mul_16_2 + bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_bench(vec_mul_32_2 + bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_bench(mu_law_compression_14bit + bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + add_armral_bench(mu_law_compression_8bit + bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + add_armral_bench(mu_law_compression_9bit + bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + add_armral_bench(mu_law_decompression_14bit + bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + add_armral_bench(mu_law_decompression_8bit + bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + add_armral_bench(mu_law_decompression_9bit + bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + add_armral_bench( + block_float_compression_12bit + bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + add_armral_bench( + block_float_compression_14bit + 
bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + add_armral_bench(block_float_compression_8bit + bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + add_armral_bench(block_float_compression_9bit + bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + add_armral_bench( + block_float_decompression_12bit + bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + add_armral_bench( + block_float_decompression_14bit + bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + add_armral_bench( + block_float_decompression_8bit + bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + add_armral_bench( + block_float_decompression_9bit + bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + add_armral_bench( + block_scaling_compression_14bit + bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + add_armral_bench( + block_scaling_compression_8bit + bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + add_armral_bench( + block_scaling_compression_9bit + bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + add_armral_bench( + block_scaling_decompression_14bit + bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + add_armral_bench( + block_scaling_decompression_8bit + bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + add_armral_bench( + block_scaling_decompression_9bit + bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + add_armral_bench(arm_fir_filter_cs16_decimate_2 + bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + add_armral_bench(arm_fir_filter_cf32_decimate_2 + bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) + 
add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + add_armral_bench(polar_rate_matching + bench/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_bench(polar_rate_recovery + bench/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_bench(polar_subchannel_deinterleave + bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_bench(polar_subchannel_interleave + bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) endif() if(BUILD_EXAMPLES) @@ -1009,15 +805,13 @@ if(BUILD_EXAMPLES) add_dependencies(run_examples run_${EXAMPLE_EXE}) endfunction() - if(NOT ARMRAL_ARCH STREQUAL "HWY") - add_armral_example(examples/block_float_9b_example.c) - add_armral_example(examples/fft_cf32_example.c 10) - add_armral_example(examples/modulation_example.c) - add_armral_example(examples/polar_example.cpp 128 100 35) - endif() + add_armral_example(examples/block_float_9b_example.c) + add_armral_example(examples/fft_cf32_example.c 10) + add_armral_example(examples/modulation_example.c) + add_armral_example(examples/polar_example.cpp 128 100 35) endif() -if(BUILD_SIMULATION AND NOT (ARMRAL_ARCH STREQUAL "HWY")) +if(BUILD_SIMULATION) # Include simulation rules and targets This involves building dependencies # like AWGN library and OpenMP add_subdirectory(simulation) diff --git a/CREDITS.md b/CREDITS.md index 0271d77..1249662 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -2,6 +2,19 @@ In addition to the primary development being done by Arm, the following people and organizations have contributed to Arm RAN Acceleration Library: +- The following Google Highway implementations: + `src/LowerPHY/Scrambling/highway/arm_scrambling.cpp`, + `src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp`, + `src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp`, + `src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp`, + `src/UpperPHY/Modulation/highway/arm_modulation.cpp`, + `src/UpperPHY/Demodulation/highway/arm_demodulation.cpp`, + `src/UpperPHY/LDPC/highway/ldpc_encoder.cpp`, + `src/UpperPHY/LDPC/highway/ldpc_decoder.cpp`, + `src/utils/highway/bits_to_bytes.hpp` + have been contributed by Cambridge Consultants. See + . + - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See @@ -32,4 +45,4 @@ Acceleration Library: - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support soft buffer sizes was contributed upstream by 4g5g Consultants. See - . + . 
\ No newline at end of file diff --git a/armral_acle.cmake.in b/armral_acle.cmake.in deleted file mode 100644 index d1e9c0d..0000000 --- a/armral_acle.cmake.in +++ /dev/null @@ -1,136 +0,0 @@ -if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # If the optimization flags are already set, don't try and guess what they - # should be. - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.5-a+sve2+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8.2-a+sve+crypto+fp16" - CACHE INTERNAL "") - elseif(ARMRAL_ARCH STREQUAL "NEON") - set(ARMRAL_ARCH_COMPILE_OPTIONS - "-march=armv8-a+crypto" - CACHE INTERNAL "") - else() - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # We explicitly set the optimization flags, so just copy those. We still need - # to set the appropriate SVE version definition - set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -else() - set(ARMRAL_ARCH_COMPILE_OPTIONS "") - if(ARMRAL_ARCH STREQUAL "SVE2") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - elseif(ARMRAL_ARCH STREQUAL "SVE") - set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message( - FATAL_ERROR - "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") - endif() -endif() - -set(ARMRAL_LIB_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c - 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) - -add_library(armral ${ARMRAL_LIB_SOURCES}) diff --git a/armral_hwy.cmake b/armral_hwy.cmake new file mode 100644 index 0000000..12e0274 --- /dev/null +++ b/armral_hwy.cmake @@ -0,0 +1,721 @@ +cmake_minimum_required(VERSION 3.10) + +# TODO possibly switch highway from a submodule to ExternalProject_Add +set(HWY_ENABLE_CONTRIB + OFF + CACHE BOOL "Include HWY contrib/ folder") +set(HWY_ENABLE_EXAMPLES + OFF + CACHE BOOL "Build HWY examples") +# set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install HWY library") +set(HWY_ENABLE_TESTS + OFF + CACHE BOOL "Enable HWY tests") + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-march=native" COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + +if(ARMRAL_OPT_FLAGS) + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + # handle configuring static dispatch for a specified -m string + set(HWY_COMPILE_ONLY_STATIC + ON + CACHE BOOL "") + add_compile_options(${ARMRAL_ARCH_COMPILE_OPTIONS}) +elseif(COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + # pick a less conservative baseline where possible + add_compile_options("-march=native") +endif() +add_subdirectory(highway) + +set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_HWY=1") + +if(ARMRAL_OPT_FLAGS) + target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) +endif() + +# The PMULL instruction requires the AES extension which is only available under +# NEON and SVE2 on aarch64. 
We have disabled SVE for all Arm platforms when +# PMULL is required, to avoid falling back to (slower) generic implementations. +set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE_256|HWY_SVE) + +# Sequence Generation is only built for NEON. SVE is disabled to avoid the +# generic implementation of PMULL (described above). SVE2 is disabled because +# the vbslq_ intrinsic available on NEON is implemented in Highway using 2 +# function calls and 3 intrinsics, resulting in a significant slowdown. +set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + +# Demodulation disables both SVE and SVE2. The VQRDMULH instruction required by +# demodulation for fixed-point multiplication is only available under NEON and +# SVE2 on aarch64, so SVE is disabled for all Arm platforms when VQRDMULH is +# required, to avoid falling back to (slower) generic implementations. SVE2 is +# additionally disabled for all Arm platforms for demodulation because the SVE +# implementation of the OrderedDemote2To operation adds a ~40% overhead to +# demodulation. +set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) +# GCC recognizes the usage of XOR as an associative operation and tries to +# optimize the operation tree in its tree-reassoc pass, but this actually makes +# the performance much worse. Disabling the tree-reassoc pass means that the +# compiler uses our carefully balanced operation tree instead.
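(The set_property immediately below applies -fno-tree-reassoc to those CRC sources.) For reference, the highway/ kernels named in this file all follow Highway's standard foreach_target pattern, which is what HWY_COMPILE_ONLY_STATIC and the per-source HWY_DISABLED_TARGETS definitions above act on. A minimal sketch of that pattern follows; the file name, namespace and kernel are hypothetical and are not code from this patch:

    // Hypothetical sketch of a Highway per-target kernel (not a file from this
    // patch). Each enabled target gets its own copy of the HWY_NAMESPACE section;
    // HWY_DISABLED_TARGETS removes targets from both compilation and dispatch.
    #include <cstddef>
    #include <cstdint>

    #undef HWY_TARGET_INCLUDE
    #define HWY_TARGET_INCLUDE "example_xor_kernel.cpp"  // this file's own name
    #include "hwy/foreach_target.h"  // must be included before highway.h
    #include "hwy/highway.h"

    HWY_BEFORE_NAMESPACE();
    namespace example {
    namespace HWY_NAMESPACE {
    namespace hn = hwy::HWY_NAMESPACE;

    // XOR two byte buffers; the vector width adapts to whichever target is built.
    void XorBytes(const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n) {
      const hn::ScalableTag<uint8_t> d;
      const size_t lanes = hn::Lanes(d);
      size_t i = 0;
      for (; i + lanes <= n; i += lanes) {
        hn::StoreU(hn::Xor(hn::LoadU(d, a + i), hn::LoadU(d, b + i)), d, out + i);
      }
      for (; i < n; ++i) {
        out[i] = a[i] ^ b[i];  // scalar tail
      }
    }

    }  // namespace HWY_NAMESPACE
    }  // namespace example
    HWY_AFTER_NAMESPACE();

    #if HWY_ONCE
    namespace example {
    HWY_EXPORT(XorBytes);

    // Entry point: dispatches to the best target left enabled for this source.
    void xor_bytes(const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n) {
      HWY_DYNAMIC_DISPATCH(XorBytes)(a, b, out, n);
    }
    }  // namespace example
    #endif  // HWY_ONCE

With ARMRAL_OPT_FLAGS given, HWY_COMPILE_ONLY_STATIC restricts compilation to the single target implied by the supplied -march string and HWY_DYNAMIC_DISPATCH collapses to a direct call; otherwise the best target that is compiled in and not listed in HWY_DISABLED_TARGETS is selected at run time.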
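The demodulation note above names two Highway operations whose per-target behaviour drives the disabled-targets choice: MulFixedPoint15 (a saturating VQRDMULH-style Q15 multiply) and OrderedDemote2To (narrowing int16 results to int8 while preserving order). A rough, illustrative kernel using them is sketched below; it is not the library's demodulation code, and the function name and parameters are invented. It is compiled for the static target only (plain hwy/highway.h include) to keep the sketch short.

    // Illustrative only: scale int16 samples by a Q15 factor, then narrow
    // pairs of int16 vectors to one int8 vector in order.
    #include <cstddef>
    #include <cstdint>

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    void scale_to_int8(const int16_t *in, int16_t scale_q15, int8_t *out, size_t n) {
      const hn::ScalableTag<int16_t> d16;
      const hn::Repartition<int8_t, decltype(d16)> d8;  // same vector size, int8 lanes
      const size_t lanes16 = hn::Lanes(d16);
      const auto scale = hn::Set(d16, scale_q15);
      size_t i = 0;
      // Two int16 vectors in, one full int8 vector out per iteration.
      for (; i + 2 * lanes16 <= n; i += 2 * lanes16) {
        const auto lo = hn::MulFixedPoint15(hn::LoadU(d16, in + i), scale);
        const auto hi = hn::MulFixedPoint15(hn::LoadU(d16, in + i + lanes16), scale);
        hn::StoreU(hn::OrderedDemote2To(d8, lo, hi), d8, out + i);
      }
      for (; i < n; ++i) {  // scalar tail, kept simple for the sketch
        int32_t p = (int32_t{in[i]} * scale_q15 + (1 << 14)) >> 15;
        out[i] = static_cast<int8_t>(p > 127 ? 127 : (p < -128 ? -128 : p));
      }
    }

The comment above cites a ~40% overhead for the SVE implementation of the OrderedDemote2To step, which is why arm_demodulation.cpp is pinned to NEON here.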
+set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + APPEND + PROPERTY COMPILE_OPTIONS $<$:-fno-tree-reassoc>) + +set(ARMRAL_LIB_SOURCES + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp +) + +if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + set(ARMRAL_COMPILER_FLAGS + ${ARMRAL_COMPILER_FLAGS} + $<$:-Wshadow + -Wall + -Wcast-qual> + $<$:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17> + $<$:-Og + -g3 + -ggdb + -fno-omit-frame-pointer>) + # Disable GLIBCXX assertions to avoid introducing dependency on libstdc++ + add_definitions(-D_GLIBCXX_NO_ASSERTIONS) + message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}") +else() + # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the + # compile line + message(STATUS "Overriding compilation flags with manually set flags") + message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + set(ARMRAL_COMPILER_FLAGS "") + set(ARMRAL_LINKER_FLAGS "") +endif() + +add_library(armral ${ARMRAL_LIB_SOURCES}) +target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) + +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) +target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) + +target_link_libraries(armral PUBLIC hwy) +target_link_libraries(armral_utils PUBLIC hwy) + +if(ARMRAL_SEMIHOSTING) + # When semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable + # ARMRAL_SEMIHOSTING + target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") + target_compile_definitions(armral_utils PUBLIC "ARMRAL_SEMIHOSTING") +endif() + +include(GNUInstallDirs) +install(TARGETS armral DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install( + DIRECTORY include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") +install(FILES LICENSE.md THIRD_PARTY_LICENSES.md + DESTINATION ${CMAKE_INSTALL_DATADIR}/licenses/armral) + +if(BUILD_TESTING) + include(CTest) + + if(NOT DEFINED BENCHMARKER_SOURCE_DIR) + set(BENCHMARKER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_BUILD_DIR) + set(BENCHMARKER_BUILD_DIR ${CMAKE_BINARY_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_RUNNER) + set(BENCHMARKER_RUNNER "${BENCHMARKER_SOURCE_DIR}/bench/default_runner.py") + endif() + + add_custom_target( + check + COMMAND ${CMAKE_CTEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_custom_target( + bench + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_concurrent + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_excel_summary + COMMAND + 
${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} | tee + ${BENCHMARKER_BUILD_DIR}/out.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py + ${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + set(ARMRAL_TEST_LINK_LIBRARIES armral armral_utils) + + if(STATIC_TESTING) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -static) + endif() + + # Utility function to add a test + function(add_armral_test TEST_NAME TEST_SOURCE) + # Build the actual test executable itself + add_executable(${TEST_NAME} ${TEST_SOURCE}) + target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(${TEST_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} + ${ARMRAL_ARCH_COMPILE_OPTIONS}) + + # Register it as a test, set up dependencies + add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} + ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + if(ARMRAL_ENABLE_ASAN) + # Avoid slow-downs in newer versions of Address Santizier + # https://github.com/llvm/llvm-project/issues/64190 + set_tests_properties( + ${TEST_NAME} PROPERTIES ENVIRONMENT + "ASAN_OPTIONS=detect_stack_use_after_return=0") + endif() + add_dependencies(check ${TEST_NAME}) + endfunction() + + # Utility function to add a benchmark + function(add_armral_bench BENCH_NAME BENCH_SOURCE) + + # Build the actual bench executable itself + add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) + target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) + + # Register it as a benchmark, set up dependencies + add_dependencies(bench bench_${BENCH_NAME}) + add_dependencies(bench_concurrent bench_${BENCH_NAME}) + add_dependencies(bench_excel_summary bench_${BENCH_NAME}) + + # Add target for running the benchmark + get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) + add_custom_target( + run_bench_${BENCH_NAME} + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py + ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} ${BENCHMARKER_BUILD_DIR} + --runner ${BENCHMARKER_RUNNER} --concurrent ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR} + DEPENDS bench_${BENCH_NAME}) + endfunction() + + # cmake-format: off +# add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) +# add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) +# add_armral_test(arm_solve +# test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_16 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_32 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_mult_16 +# test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) +# add_armral_test(matrix_mult_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) +# add_armral_test(matrix_mult_aah_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_test(matrix_mult_ahb_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_test( +# matrix_vector_mult_single_16 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) +# add_armral_test( 
+# matrix_vector_mult_single_32 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_pseudo_inv_direct +# test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_test(vec_dot_16_2 +# test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_test(vec_dot_16_2_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_test(vec_dot_16_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_test(vec_dot_32_2 +# test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_test(mu_law_compression +# test/DuRuInterface/MuLaw/Compression/main.cpp) +# add_armral_test(mu_law_decompression +# test/DuRuInterface/MuLaw/Decompression/main.cpp) +# add_armral_test(block_float_compression +# test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) +# add_armral_test(block_float_decompression +# test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) +# add_armral_test(block_scaling_compression +# test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) +# add_armral_test(block_scaling_decompression +# test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) +# add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) +# add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_test(arm_fir_filter_cs16_decimate_2 +# test/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_test(arm_fir_filter_cf32_decimate_2 +# test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) +# add_armral_test(polar_crc_attachment +# test/UpperPHY/Polar/CrcAttachment/main.cpp) +# add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_test(polar_subchannel_deinterleave +# 
test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_test(polar_subchannel_interleave +# test/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) +# add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) +# +# add_armral_bench( +# matrix_inv_batch_general +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) +# add_armral_bench(matrix_inv_batch_general_pa +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian_pa +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) +# add_armral_bench(matrix_inv_single_general +# bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) +# add_armral_bench(matrix_inv_single_hermitian +# bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) +# add_armral_bench(arm_solve_1x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) +# add_armral_bench(arm_solve_1x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) +# add_armral_bench(arm_solve_2x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) +# add_armral_bench(arm_solve_2x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) +# add_armral_bench(arm_solve_4x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32 +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) +# add_armral_bench( +# matrix_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) +# add_armral_bench( +# matrix_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_4x4_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_4x4 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) +# add_armral_bench( +# matmul_f32_general +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) +# add_armral_bench( +# matrix_mult_aah_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_bench( +# matrix_mult_ahb_32 +# 
bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_bench(matrix_pseudo_inv_direct +# bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_bench(vec_dot_16 +# bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_bench(vec_dot_16_2 +# bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_bench(vec_dot_16_2_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_bench(vec_dot_16_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_bench(vec_dot_32 +# bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_bench(vec_dot_32_2 +# bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_bench(vec_mul_16_2 +# bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_bench(vec_mul_32_2 +# bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_bench(mu_law_compression_14bit +# bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) +# add_armral_bench(mu_law_compression_8bit +# bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) +# add_armral_bench(mu_law_compression_9bit +# bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) +# add_armral_bench(mu_law_decompression_14bit +# bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) +# add_armral_bench(mu_law_decompression_8bit +# bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) +# add_armral_bench(mu_law_decompression_9bit +# bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_float_compression_12bit +# bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) +# add_armral_bench( +# block_float_compression_14bit +# bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) +# add_armral_bench(block_float_compression_8bit +# bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) +# add_armral_bench(block_float_compression_9bit +# bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) +# add_armral_bench( +# block_float_decompression_12bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) +# add_armral_bench( +# block_float_decompression_14bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_float_decompression_8bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) +# add_armral_bench( +# block_float_decompression_9bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_14bit +# bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_8bit +# bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_9bit +# bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_14bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_8bit +# 
bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_9bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) +# add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) +# add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_bench(arm_fir_filter_cs16_decimate_2 +# bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_bench(arm_fir_filter_cf32_decimate_2 +# bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) +# add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_bench(polar_rate_matching +# bench/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_bench(polar_rate_recovery +# bench/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_bench(polar_subchannel_deinterleave +# bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_bench(polar_subchannel_interleave +# bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_bench(turbo_rate_matching +# bench/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_bench(turbo_rate_recovery +# bench/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) +# cmake-format: on +endif() + +if(BUILD_EXAMPLES) + add_custom_target(make_examples_dir ALL COMMAND ${CMAKE_COMMAND} -E + make_directory examples) + 
add_custom_target(examples) + add_custom_target(run_examples) + add_dependencies(run_examples examples) + + # Any parameters after the first one will be passed as parameters to the + # example executable when running it + function(add_armral_example EXAMPLE_SOURCE) + get_filename_component(EXAMPLE_EXE ${EXAMPLE_SOURCE} NAME_WE) + add_executable(${EXAMPLE_EXE} ${EXAMPLE_SOURCE}) + add_dependencies(${EXAMPLE_EXE} make_examples_dir) + set(EXAMPLE_OUTPUT_NAME examples/${EXAMPLE_EXE}) + set_target_properties(${EXAMPLE_EXE} PROPERTIES OUTPUT_NAME + ${EXAMPLE_OUTPUT_NAME}) + + target_link_libraries(${EXAMPLE_EXE} armral m) + + add_custom_target( + run_${EXAMPLE_EXE} + COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} + DEPENDS ${EXAMPLE_EXE}) + add_dependencies(examples ${EXAMPLE_EXE}) + add_dependencies(run_examples run_${EXAMPLE_EXE}) + endfunction() + + # cmake-format: off +# add_armral_example(examples/block_float_9b_example.c) +# add_armral_example(examples/fft_cf32_example.c 10) +# add_armral_example(examples/modulation_example.c) +# add_armral_example(examples/polar_example.cpp 128 100 35) +# cmake-format: on +endif() + +# if(BUILD_SIMULATION) # Include simulation rules and targets This involves +# building dependencies # like AWGN library and OpenMP +# add_subdirectory(simulation) endif() + +find_package(Doxygen) +if(DOXYGEN_FOUND) + set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) + set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) + configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) + add_custom_target(docs COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}) + install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs/html + DESTINATION ${CMAKE_INSTALL_DOCDIR} + OPTIONAL) +endif() + +# Create target to uninstall the library +if(NOT TARGET uninstall) + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) + + add_custom_target( + uninstall COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) +endif() + +# Check that the C and C++ compilers are from the same toolchain +if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) + message( + FATAL_ERROR + "CXX and C compiler providers differ. Please specify the same compiler toolchain" + ) +endif() + +set(COMP_ERR_MSG + "Compilation is only supported with GNU versions 7, 8, 9, 10, 11, 12, 13, 14. \ + If compilation fails please use one of the supported compilers." 
+) +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION + VERSION_GREATER 14.2) + message(WARNING ${COMP_ERR_MSG}) + endif() +else() + message(WARNING ${COMP_ERR_MSG}) +endif() diff --git a/armral_hwy.cmake.in b/armral_hwy.cmake.in deleted file mode 100644 index a61f82b..0000000 --- a/armral_hwy.cmake.in +++ /dev/null @@ -1,176 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -# TODO possibly switch highway from a submodule to ExternalProject_Add -set(HWY_ENABLE_CONTRIB - OFF - CACHE BOOL "Include HWY contrib/ folder") -set(HWY_ENABLE_EXAMPLES - OFF - CACHE BOOL "Build HWY examples") -# set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install HWY library") -set(HWY_ENABLE_TESTS - OFF - CACHE BOOL "Enable HWY tests") - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-march=native" COMPILER_OPT_ARCH_NATIVE_SUPPORTED) - -if(ARMRAL_OPT_FLAGS) - set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) - # handle configuring static dispatch for a specified -m string - set(HWY_COMPILE_ONLY_STATIC - ON - CACHE BOOL "") - add_compile_options(${ARMRAL_ARCH_COMPILE_OPTIONS}) -elseif(COMPILER_OPT_ARCH_NATIVE_SUPPORTED) - # pick a less conservative baseline where possible - add_compile_options("-march=native") -endif() -add_subdirectory(highway) - -set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_HWY=1") - -if(ARMRAL_OPT_FLAGS) - target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) -endif() - -# The PMULL instruction is required for CRC and others which requires the AES -# extension that is only available under NEON and SVE2 on aarch64. To avoid -# falling back to generic implementations we fix ourselves on NEON for all Arm -# platforms -set_property( - SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp - APPEND - PROPERTY COMPILE_DEFINITIONS - HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) - -# For Demodulation disable SVE/SVE2 due to: The VQRDMULH instruction required by -# demodulation for fixed point multiplication is only available under NEON and -# SVE2 on aarch64, therefore, we have disabled SVE for all Arm platforms when -# VQRDMULH is required; to avoid falling back to (slower) generic -# implementations. Additionally disable SVE2 for all Arm platforms for -# demodulation as the SVE implementation of the OrderedDemote2To operation adds -# a ~40% overhead to demodulation. For Convolutional Decoder disable SVE/SVE2 -# due to: the SVE implementation of the OrderedDemote2To operation adding a ~40% -# overhead -set_property( - SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp - APPEND - PROPERTY COMPILE_DEFINITIONS - HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) - -# GCC recognizes the usage of XOR as an associative operation, then it tries to -# optimize the operation tree in its tree-reassoc pass, but it actually makes -# the performance much worse. Disabling the tree-assoc pass means that the -# compiler uses our carefully balanced operation tree instead. 
-set_property( - SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - APPEND - PROPERTY COMPILE_OPTIONS $<$:-fno-tree-reassoc>) - -set(ARMRAL_LIB_SOURCES - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c - # 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - # 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp -) - -add_library(armral ${ARMRAL_LIB_SOURCES}) - -target_link_libraries(armral PUBLIC hwy) -target_link_libraries(armral_utils PUBLIC hwy) diff --git a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp index 99fe133..d0e3fca 100644 --- a/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/highway/arm_scrambling.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" @@ -10,8 +8,8 @@ namespace hn = hwy::HWY_NAMESPACE; HWY_FORCED_INLINE void xor_u8(const uint8_t *__restrict &src, - const uint8_t *__restrict &seq, uint8_t *&dst, - size_t n_lanes) { + const uint8_t *__restrict &seq, uint8_t *&dst, + size_t n_lanes) { hn::StoreU(hn::Xor(hn::LoadU(du8, src), hn::LoadU(du8, seq)), du8, dst); src += n_lanes; seq += n_lanes; @@ -19,8 +17,8 @@ HWY_FORCED_INLINE void xor_u8(const uint8_t *__restrict &src, } HWY_FORCED_INLINE void xor_u8_partial(const uint8_t *__restrict &src, - const uint8_t *__restrict &seq, uint8_t *&dst, - size_t n_lanes) { + const uint8_t *__restrict &seq, + uint8_t *&dst, size_t n_lanes) { Mask_u8 final_mask = hn::FirstN(du8, n_lanes); Vec_u8 src_vec = no_sanitize::MaskedLoad(final_mask, du8, src); Vec_u8 seq_vec = no_sanitize::MaskedLoad(final_mask, du8, seq); diff --git a/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp index 044b682..3d7293c 100644 --- a/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" @@ -19,12 +17,16 @@ static inline void generate_seq_128(uint64_t *x) { Vec_u64x2 x_vec = hn::Set(du64x2, *x); - Vec_u64x2 low_20 = hn::ShiftRight<44>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[0]))); - Vec_u64x2 mid_28 = hn::ShiftRight<16>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); + Vec_u64x2 low_20 = + hn::ShiftRight<44>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[0]))); + Vec_u64x2 mid_28 = + hn::ShiftRight<16>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); Vec_u64x2 high_16 = hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[2])); - Vec_u64x2 intermediate_result = hn::BitwiseIfThenElse(mask_low_20, low_20, mid_28); - *x = hn::GetLane(hn::BitwiseIfThenElse(mask_high_16, high_16, intermediate_result)); + Vec_u64x2 intermediate_result = + hn::BitwiseIfThenElse(mask_low_20, low_20, mid_28); + *x = hn::GetLane( + hn::BitwiseIfThenElse(mask_high_16, high_16, intermediate_result)); } template @@ -46,12 +48,16 @@ static inline void generate_seq_64(uint64_t *x) { Vec_u64x2 x_vec = hn::Set(du64x2, *x); - Vec_u64x2 low_28 = hn::ShiftRight<36>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[0]))); - Vec_u64x2 mid_28 = hn::ShiftRight<8>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); + Vec_u64x2 low_28 = + hn::ShiftRight<36>(hn::CLMulLower(x_vec, hn::Set(du64x2, 
pmask[0]))); + Vec_u64x2 mid_28 = + hn::ShiftRight<8>(hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[1]))); Vec_u64x2 high_8 = hn::CLMulLower(x_vec, hn::Set(du64x2, pmask[2])); - Vec_u64x2 intermediate_result = hn::BitwiseIfThenElse(mask_low_28, low_28, mid_28); - *x = hn::GetLane(hn::BitwiseIfThenElse(mask_high_8, high_8, intermediate_result)); + Vec_u64x2 intermediate_result = + hn::BitwiseIfThenElse(mask_low_28, low_28, mid_28); + *x = hn::GetLane( + hn::BitwiseIfThenElse(mask_high_8, high_8, intermediate_result)); } armral_status armral_seq_generator(uint32_t sequence_len, uint32_t seed, @@ -66,8 +72,12 @@ armral_status armral_seq_generator(uint32_t sequence_len, uint32_t seed, // Set the first 64 bits x2. uint64_t cinit = seed & 0x7fffffff; uint64_t x2 = cinit; - x2 |= hn::GetLane(hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), mask_28)); - x2 |= hn::GetLane(hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), mask_5)); + x2 |= hn::GetLane( + hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), + mask_28)); + x2 |= hn::GetLane( + hn::And(hn::CLMulLower(hn::Set(du64x2, x2), hn::Set(du64x2, 0xf0000000)), + mask_5)); // The sequence x1 is determined according to x1(n+31) = x1(n+3) ^ x1(n) // The initial conditions for x1 are x1(0) = 1, x1(n) = 0, n = 1, 2, ..., 30. diff --git a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp index 5749648..292e9e7 100644 --- a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" #include "utils/allocators.hpp" @@ -17,8 +15,6 @@ namespace hn = hwy::HWY_NAMESPACE; #include "utils/hwy_types.hpp" - - namespace { struct pm_s { @@ -136,14 +132,20 @@ armral_status tail_biting_convolutional_decode_block( // bma = abs(s0 - t0) Vec_i16x8 bma0_lo = hn::Set(di16x8, 0); Vec_i16x8 bma0_hi = hn::Set(di16x8, 0); - bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t00))); - bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t00))); + bma0_lo = hn::Add(bma0_lo, + hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t00))); + bma0_hi = hn::Add( + bma0_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t00))); // bma += abs(s1 - t1) - bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t01))); - bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t01))); + bma0_lo = hn::Add(bma0_lo, + hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t01))); + bma0_hi = hn::Add( + bma0_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t01))); // bma += abs(s2 - t2) - bma0_lo = hn::Add(bma0_lo, hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t02))); - bma0_hi = hn::Add(bma0_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t02))); + bma0_lo = hn::Add(bma0_lo, + hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t02))); + bma0_hi = hn::Add( + bma0_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t02))); // The branch metric for the prevb state is computed as 3 - the branch // metric of the preva state (Q format, so bmb = 765 - bma)11 Vec_i16x8 bmb0_lo = hn::Sub(all_765s, bma0_lo); @@ -151,17 
+153,23 @@ armral_status tail_biting_convolutional_decode_block( // Compute table1 branch metrics (bma and bmb) - Vec_i8x16 t10 =hn::LoadU(di8x16, table1_0 + j); - Vec_i8x16 t11 =hn::LoadU(di8x16, table1_1 + j); - Vec_i8x16 t12 =hn::LoadU(di8x16, table1_2 + j); + Vec_i8x16 t10 = hn::LoadU(di8x16, table1_0 + j); + Vec_i8x16 t11 = hn::LoadU(di8x16, table1_1 + j); + Vec_i8x16 t12 = hn::LoadU(di8x16, table1_2 + j); Vec_i16x8 bma1_lo = hn::Set(di16x8, 0); Vec_i16x8 bma1_hi = hn::Set(di16x8, 0); - bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t10))); - bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t10))); - bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t11))); - bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t11))); - bma1_lo = hn::Add(bma1_lo, hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t12))); - bma1_hi = hn::Add(bma1_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t12))); + bma1_lo = hn::Add(bma1_lo, + hn::AbsDiff(s0_low, hn::PromoteLowerTo(di16x8, t10))); + bma1_hi = hn::Add( + bma1_hi, hn::AbsDiff(s0_high, hn::PromoteUpperTo(di16x8, t10))); + bma1_lo = hn::Add(bma1_lo, + hn::AbsDiff(s1_low, hn::PromoteLowerTo(di16x8, t11))); + bma1_hi = hn::Add( + bma1_hi, hn::AbsDiff(s1_high, hn::PromoteUpperTo(di16x8, t11))); + bma1_lo = hn::Add(bma1_lo, + hn::AbsDiff(s2_low, hn::PromoteLowerTo(di16x8, t12))); + bma1_hi = hn::Add( + bma1_hi, hn::AbsDiff(s2_high, hn::PromoteUpperTo(di16x8, t12))); Vec_i16x8 bmb1_lo = hn::Sub(all_765s, bma1_lo); Vec_i16x8 bmb1_hi = hn::Sub(all_765s, bma1_hi); @@ -173,12 +181,15 @@ armral_status tail_biting_convolutional_decode_block( // Load intermediate_sm[prev_state_a] and intermediate_sm[prev_state_b] Vec_i32x4 int_sm_0 = hn::Undefined(di32x4); Vec_i32x4 int_sm_1 = hn::Undefined(di32x4); - hn::LoadInterleaved2(di32x4, &intermediate_sm[2 * j], int_sm_0, int_sm_1); + hn::LoadInterleaved2(di32x4, &intermediate_sm[2 * j], int_sm_0, + int_sm_1); // intermediate_sm[prev_state_a] + bma - Vec_i32x4 int_bma0 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, bma0_lo)); + Vec_i32x4 int_bma0 = + hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, bma0_lo)); // intermediate_sm[prev_state_b] + bmb - Vec_i32x4 int_bmb0 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, bmb0_lo)); + Vec_i32x4 int_bmb0 = + hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, bmb0_lo)); // if (intermediate_sm[prev_state_a] + bm_a > // intermediate_sm[prev_state_b] + bm_b) // prev[j][i] = prev_state_a; @@ -190,8 +201,10 @@ armral_status tail_biting_convolutional_decode_block( // Compute table1 state metrics and previous states matrix - Vec_i32x4 int_bma1 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma1_lo))); - Vec_i32x4 int_bmb1 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb1_lo))); + Vec_i32x4 int_bma1 = + hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma1_lo))); + Vec_i32x4 int_bmb1 = + hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb1_lo))); pred = hn::RebindMask(du32x4, hn::Le(int_bma1, int_bmb1)); finalsm = hn::Max(int_bmb1, int_bma1); hn::StoreU(finalsm, di32x4, &final_sm[j + states / 2]); @@ -204,7 +217,8 @@ armral_status tail_biting_convolutional_decode_block( preva = hn::Add(preva, all_8s); prevb = hn::Add(prevb, all_8s); - hn::LoadInterleaved2(di32x4, &intermediate_sm[8 + 2 * j], int_sm_0, int_sm_1); + hn::LoadInterleaved2(di32x4, &intermediate_sm[8 + 2 * j], int_sm_0, + int_sm_1); int_bma0 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma0_lo)); int_bmb0 = hn::Add(int_sm_1, 
hn::PromoteUpperTo(di32x4, bmb0_lo)); @@ -226,7 +240,8 @@ armral_status tail_biting_convolutional_decode_block( Vec_u16x8 prevab0l = hn::OrderedDemote2To(du16x8, prevab0ll, prevab0lh); Vec_u16x8 prevab1l = hn::OrderedDemote2To(du16x8, prevab1ll, prevab1lh); - hn::LoadInterleaved2(di32x4, &intermediate_sm[16 + 2 * j], int_sm_0, int_sm_1); + hn::LoadInterleaved2(di32x4, &intermediate_sm[16 + 2 * j], int_sm_0, + int_sm_1); int_bma0 = hn::Add(int_sm_0, hn::PromoteLowerTo(di32x4, (bma0_hi))); int_bmb0 = hn::Add(int_sm_1, hn::PromoteLowerTo(di32x4, (bmb0_hi))); @@ -245,7 +260,8 @@ armral_status tail_biting_convolutional_decode_block( preva = hn::Add(preva, all_8s); prevb = hn::Add(prevb, all_8s); - hn::LoadInterleaved2(di32x4, &intermediate_sm[24 + 2 * j], int_sm_0, int_sm_1); + hn::LoadInterleaved2(di32x4, &intermediate_sm[24 + 2 * j], int_sm_0, + int_sm_1); int_bma0 = hn::Add(int_sm_0, hn::PromoteUpperTo(di32x4, bma0_hi)); int_bmb0 = hn::Add(int_sm_1, hn::PromoteUpperTo(di32x4, bmb0_hi)); diff --git a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp index e029f81..7727178 100644 --- a/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp @@ -1,20 +1,16 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" #include #include -#include #include "utils/hwy_types.hpp" +#include namespace hn = hwy::HWY_NAMESPACE; - - armral_status armral_tail_biting_convolutional_encode_block(const uint8_t *src, uint32_t k, uint8_t *dst0, @@ -42,9 +38,12 @@ armral_status armral_tail_biting_convolutional_encode_block(const uint8_t *src, uint64_t x = hn::GetLane(hn::BitCast(du64, xv)); // Compute outputs (skip the first byte) - Vec_u64 y0_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); - Vec_u64 y1_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); - Vec_u64 y2_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); + Vec_u64 y0_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); + Vec_u64 y1_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); + Vec_u64 y2_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); // Delete the 6 bits tail y0_temp = hn::ShiftRight<6>(y0_temp); @@ -80,9 +79,12 @@ armral_status armral_tail_biting_convolutional_encode_block(const uint8_t *src, uint64_t x = hn::GetLane(hn::BitCast(du64, xv_tail)); // Compute outputs (skip the first byte) - Vec_u64 y0_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); - Vec_u64 y1_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); - Vec_u64 y2_temp = hn::BitCast(du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); + Vec_u64 y0_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask0))); + Vec_u64 y1_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask1))); + Vec_u64 y2_temp = hn::BitCast( + du64, hn::CLMulLower(hn::Set(du64, x), hn::Set(du64, pmask2))); // Delete zeros (64 - rem) and tail (6) y0_temp = hn::ShiftRightSame(y0_temp, (64 - rem + 6)); diff --git 
a/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp index 4ae1002..7f55cc5 100644 --- a/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp +++ b/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" @@ -22,8 +20,8 @@ HWY_FORCED_INLINE int8_t sat_mul_double(int16_t a, int16_t b) { } HWY_FORCED_INLINE Vec_i8 generate_partial_llrs(const Vec_i16 rec_a, - const Vec_i16 rec_b, - const Vec_i16 weight_v) { + const Vec_i16 rec_b, + const Vec_i16 weight_v) { /* Computing L(c[n]/r) and L(c[n+1]/r) */ Vec_i16 llr16_1a = hn::MulFixedPoint15(rec_a, weight_v); Vec_i16 llr16_1b = hn::MulFixedPoint15(rec_b, weight_v); @@ -83,7 +81,8 @@ armral_demodulation_qpsk(const uint32_t n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); + Vec_i16 rec_a = + no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); Vec_i8_half llr8_a = generate_partial_llrs_half_vect(rec_a, weight_v); @@ -142,7 +141,8 @@ armral_demodulation_16qam(const uint32_t n_symbols, const uint16_t ulp, Vec_i8 llr8_2b = generate_partial_llrs(rec_a, rec_b, weight_v); /* Store results for consecutive sets of symbols */ - hn::StoreInterleaved2(hn::BitCast(du16, llr8_1b), hn::BitCast(du16, llr8_2b), du16, (uint16_t *)p_dst); + hn::StoreInterleaved2(hn::BitCast(du16, llr8_1b), + hn::BitCast(du16, llr8_2b), du16, (uint16_t *)p_dst); // Twice this number of int16 but count is for armral_cmplx_int16_t p_src += hn::Lanes(di16); @@ -164,7 +164,9 @@ armral_demodulation_16qam(const uint32_t n_symbols, const uint16_t ulp, Vec_i8_half llr8_2a = generate_partial_llrs_half_vect(rec_a, weight_v); /* Store results for consecutive sets of symbols */ - hn::StoreInterleaved2(hn::BitCast(du16_half, llr8_1a), hn::BitCast(du16_half, llr8_2a), du16_half, (uint16_t *)p_dst); + hn::StoreInterleaved2(hn::BitCast(du16_half, llr8_1a), + hn::BitCast(du16_half, llr8_2a), du16_half, + (uint16_t *)p_dst); // Twice this number of int16 but count is for armral_cmplx_int16_t p_src += hn::Lanes(di16_half); @@ -175,7 +177,8 @@ armral_demodulation_16qam(const uint32_t n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); + Vec_i16 rec_a = + no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); /* Computing L(c0/r) and L(c1/r) */ Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); @@ -428,7 +431,8 @@ armral_demodulation_256qam(const uint32_t n_symbols, const uint16_t ulp, if (tail_cnt > 0U) { size_t iteration_symbols = HWY_MIN(hn::Lanes(di16), tail_cnt * 2); Mask_i16 load_mask = hn::FirstN(di16, iteration_symbols); - Vec_i16 rec_a = no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); + Vec_i16 rec_a = + no_sanitize::MaskedLoad(load_mask, di16, (const int16_t *)p_src); /* Computing L(c0/r) and L(c1/r) */ Vec_i8_half llr8_1a = generate_partial_llrs_half_vect(rec_a, weight_v); diff --git 
a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp index 5bcdeda..e48da96 100644 --- a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "../ldpc_coding.hpp" diff --git a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp index 0af220b..f0db1ed 100644 --- a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "../ldpc_coding.hpp" #include "../ldpc_tables.hpp" diff --git a/src/UpperPHY/Modulation/highway/arm_modulation.cpp b/src/UpperPHY/Modulation/highway/arm_modulation.cpp index 5d7ecc0..d1708b3 100644 --- a/src/UpperPHY/Modulation/highway/arm_modulation.cpp +++ b/src/UpperPHY/Modulation/highway/arm_modulation.cpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #include "armral.h" #include @@ -32,28 +30,36 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, // shuffle to map flip of bit order within bytes of input (there is no // instruction for doing this on predicates directly, so we do it on the // result instead). - Vec_u64 shuf0_64 = hn::MulAdd(hn::Iota(du64, 0), hn::Set(du64, 0x0808080808080808ULL), hn::Set(du64, 0x0001020304050607ULL)); - Vec_i16 shuf0 = hn::BitCast(di16, hn::InterleaveWholeLower(hn::BitCast(du8, shuf0_64), hn::Set(du8, 0))); + Vec_u64 shuf0_64 = + hn::MulAdd(hn::Iota(du64, 0), hn::Set(du64, 0x0808080808080808ULL), + hn::Set(du64, 0x0001020304050607ULL)); + Vec_i16 shuf0 = + hn::BitCast(di16, hn::InterleaveWholeLower(hn::BitCast(du8, shuf0_64), + hn::Set(du8, 0))); while (nbits > n_lanes8) { // load predicate as one bit per 8-bit element, then unpack into one bit // per 16-bit element and use to select result. 
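As a scalar model of what this selection step produces (a sketch under stated assumptions, not the library implementation): each input bit, consumed MSB-first within its byte, picks either the negative or the positive QPSK component, and consecutive pairs of components form one complex symbol. The amplitudes are left as parameters because the exact fixed-point values behind svqpsk_pos and svqpsk_neg are not shown in this hunk, and the helper name qpsk_map_ref is illustrative only.

#include <cstdint>

// Scalar reference for the bit-to-component mapping. `pos` and `neg` stand in
// for the library's svqpsk_pos / svqpsk_neg lanes; their values are assumed.
static void qpsk_map_ref(uint32_t nbits, const uint8_t *bits,
                         int16_t *components, int16_t pos, int16_t neg) {
  for (uint32_t n = 0; n < nbits; ++n) {
    // Bit 7 of each byte is consumed first, matching the bit-order flip that
    // the shuffle above applies to the vector result.
    uint8_t bit = (bits[n >> 3] >> (7 - (n & 7))) & 1u;
    // A set bit selects the negative component, a clear bit the positive one.
    components[n] = bit ? neg : pos;
  }
}
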
-#if HWY_TARGET & (HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) +#if HWY_TARGET & (HWY_SVE2_128 | HWY_SVE2 | HWY_SVE_256 | HWY_SVE) // SVE can load mask bits very fast by casting uint8 arrays to svbool Mask_u8 in = *(const svbool_t *)p_src; #else Mask_u8 in = hn::LoadMaskBits(du8, p_src); #endif - Mask_i16 in_lo = hn::PromoteMaskTo(di16, di16_du8, hn::LowerHalfOfMask(di16_du8, in)); -#if HWY_TARGET & (HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + Mask_i16 in_lo = + hn::PromoteMaskTo(di16, di16_du8, hn::LowerHalfOfMask(di16_du8, in)); +#if HWY_TARGET & (HWY_SVE2_128 | HWY_SVE2 | HWY_SVE_256 | HWY_SVE) Mask_i16 in_hi = svunpkhi_b(in); #else - Mask_i16 in_hi = hn::PromoteMaskTo(di16, di16_du8, hn::UpperHalfOfMask(di16_du8, in)); + Mask_i16 in_hi = + hn::PromoteMaskTo(di16, di16_du8, hn::UpperHalfOfMask(di16_du8, in)); #endif p_src += n_lanes64; Vec_i16 vals_lo = hn::IfThenElse(in_lo, svqpsk_neg, svqpsk_pos); Vec_i16 vals_hi = hn::IfThenElse(in_hi, svqpsk_neg, svqpsk_pos); - Vec_i16 out_lo = hn::TableLookupLanes(vals_lo, hn::IndicesFromVec(di16, shuf0)); - Vec_i16 out_hi = hn::TableLookupLanes(vals_hi, hn::IndicesFromVec(di16, shuf0)); + Vec_i16 out_lo = + hn::TableLookupLanes(vals_lo, hn::IndicesFromVec(di16, shuf0)); + Vec_i16 out_hi = + hn::TableLookupLanes(vals_hi, hn::IndicesFromVec(di16, shuf0)); hn::StoreU(out_lo, di16, (int16_t *)p_dst); hn::StoreU(out_hi, di16, ((int16_t *)p_dst) + n_lanes16); p_dst += n_lanes16; @@ -63,16 +69,19 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, const uint32_t bytes = nbits >> 3U; const uint32_t vl = n_lanes32 / 4; const uint32_t unrolls = bytes / vl; - const hn::Mask pred = hn::FirstN(du16_du8, (int32_t)vl); + const hn::Mask pred = hn::FirstN(du16_du8, (int32_t)vl); const Vec_u16 linear_series = hn::Iota(du16, 0); - const Vec_u16 mask = hn::Dup128VecFromValues(du16, 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); + const Vec_u16 mask = + hn::Dup128VecFromValues(du16, 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); Vec_u16 index = hn::InterleaveWholeLower(du16, linear_series, linear_series); index = hn::InterleaveWholeLower(du16, index, index); index = hn::InterleaveWholeLower(du16, index, index); for (uint32_t i = 0; i < unrolls; i++) { - Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoad(pred, du16_du8, p_src)); + Vec_u16 src_bytes = + hn::PromoteTo(du16, no_sanitize::MaskedLoad(pred, du16_du8, p_src)); p_src += vl; - Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); + Vec_u16 tbl = + hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); Mask_i16 mask_pred = hn::RebindMask(di16, hn::TestBit(tbl, mask)); Vec_i16 points = hn::IfThenElse(mask_pred, svqpsk_neg, svqpsk_pos); hn::StoreU(points, di16, (int16_t *)p_dst); @@ -80,12 +89,15 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, } const int32_t leftover_bytes = bytes % vl; - const hn::Mask load_lanes = hn::FirstN(du16_du8, leftover_bytes); + const hn::Mask load_lanes = + hn::FirstN(du16_du8, leftover_bytes); const uint32_t active_store_lanes = leftover_bytes * 8; if (leftover_bytes != 0) { - Vec_u16 src_bytes = hn::PromoteTo(du16, no_sanitize::MaskedLoad(load_lanes, du16_du8, p_src)); + Vec_u16 src_bytes = hn::PromoteTo( + du16, no_sanitize::MaskedLoad(load_lanes, du16_du8, p_src)); p_src += leftover_bytes; - Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); + Vec_u16 tbl = + hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); Mask_i16 mask_pred = hn::RebindMask(di16, 
hn::TestBit(tbl, mask)); Vec_i16 points = hn::IfThenElse(mask_pred, svqpsk_neg, svqpsk_pos); hn::StoreN(points, di16, (int16_t *)p_dst, leftover_bytes * 8); @@ -402,13 +414,13 @@ void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, uint32_t vl = hn::Lanes(du64); // At 128 bit vectorisation comes out slower than just using memcpy. // With longer vectors, vectorisation should overtake the memcpy method. - if (vl > 2) { // > 128 bit vectors + if (vl > 2) { // > 128 bit vectors uint32_t unrolls = blk_cnt / vl; for (uint32_t i = 0; i < unrolls; i++) { Vec_i64 svsample = hn::PromoteTo(di64, hn::LoadU(du64_du8, p_src)); p_src += vl; - Vec_i64 gather = hn::GatherIndex(di64, (const int64_t *)constellation_16qam_outer_prod, - svsample); + Vec_i64 gather = hn::GatherIndex( + di64, (const int64_t *)constellation_16qam_outer_prod, svsample); hn::StoreU(gather, di64, (int64_t *)p_dst + i * vl); } @@ -417,10 +429,12 @@ void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, uint32_t tail_size = blk_cnt - i; Mask_i64 pred_i64 = hn::FirstN(di64, tail_size); hn::Mask pred_u8 = hn::FirstN(du64_du8, tail_size); - Vec_i64 svsample = hn::PromoteTo(di64, no_sanitize::MaskedLoad(pred_u8, du64_du8, p_src)); + Vec_i64 svsample = hn::PromoteTo( + di64, no_sanitize::MaskedLoad(pred_u8, du64_du8, p_src)); p_src += blk_cnt - i; - Vec_i64 gather = hn::MaskedGatherIndex(pred_i64, di64, - (const int64_t *)constellation_16qam_outer_prod, svsample); + Vec_i64 gather = hn::MaskedGatherIndex( + pred_i64, di64, (const int64_t *)constellation_16qam_outer_prod, + svsample); hn::StoreN(gather, di64, (int64_t *)p_dst + i, tail_size); } @@ -438,7 +452,7 @@ void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, while (blk_cnt > 0U) { uint8_t sample = *p_src++; memcpy(p_dst, constellation_16qam_outer_prod + sample, - 2 * sizeof(armral_cmplx_int16_t)); + 2 * sizeof(armral_cmplx_int16_t)); p_dst += 2; blk_cnt--; } @@ -515,14 +529,17 @@ void armral_64qam_modulation(const uint32_t nbits, const uint8_t *p_src, // 3 bytes (24 bits) read in. const uint32_t vl = (hn::Lanes(di32) * 3) / 4; Mask_u8 pred = hn::FirstN(du8, vl); - Vec_u32 index = hn::MulAdd(hn::Iota(du32, 0), hn::Set(du32, 0x00030303), hn::Set(du32, 0x00000102)); + Vec_u32 index = hn::MulAdd(hn::Iota(du32, 0), hn::Set(du32, 0x00030303), + hn::Set(du32, 0x00000102)); index = hn::InterleaveWholeLower(index, index); index = hn::InterleaveWholeLower(index, index); - Vec_i32 byte_mask = hn::Dup128VecFromValues(di32, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); + Vec_i32 byte_mask = hn::Dup128VecFromValues(di32, 0x00FFFFFF, 0x00FFFFFF, + 0x00FFFFFF, 0x00FFFFFF); const uint32_t svunroll_cnt = bytes / vl; for (uint32_t i = 0; i < svunroll_cnt; i++) { Vec_u8 src_bytes = no_sanitize::MaskedLoad(pred, du8, p_src); - Vec_u8 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du8, hn::BitCast(du8, index))); + Vec_u8 tbl = hn::TableLookupLanes( + src_bytes, hn::IndicesFromVec(du8, hn::BitCast(du8, index))); Vec_i32 data = hn::BitCast(di32, tbl); // Mask out the most significant byte of each 32 bit value. 
// On Arm this can be achieved directly in the table lookup but this is not @@ -531,7 +548,8 @@ void armral_64qam_modulation(const uint32_t nbits, const uint8_t *p_src, Vec_i32 shift = hn::Dup128VecFromValues(di32, 18, 12, 6, 0); data = hn::And(hn::Shr(data, shift), hn::Set(di32, 0x3f)); p_src += vl; - Vec_u32 gather = hn::GatherIndex(du32, (const uint32_t *)constellation_64qam, data); + Vec_u32 gather = + hn::GatherIndex(du32, (const uint32_t *)constellation_64qam, data); hn::StoreU(gather, du32, (uint32_t *)p_dst); p_dst += hn::Lanes(du32); } @@ -718,7 +736,8 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, for (uint32_t i = 0; i < unr_cnt; i++) { Vec_i32 index = hn::PromoteTo(di32, hn::LoadU(di32_du8, p_src)); p_src += vl; - Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); + Vec_i32 gather = + hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); hn::StoreU(gather, di32, (int32_t *)p_dst); p_dst += vl; } @@ -726,8 +745,10 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, const uint32_t leftover_bytes = bytes - unr_cnt * vl; if (leftover_bytes != 0U) { hn::Mask pred = hn::FirstN(di32_du8, leftover_bytes); - Vec_i32 index = hn::PromoteTo(di32, no_sanitize::MaskedLoad(pred, di32_du8, p_src)); - Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); + Vec_i32 index = + hn::PromoteTo(di32, no_sanitize::MaskedLoad(pred, di32_du8, p_src)); + Vec_i32 gather = + hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); hn::StoreN(gather, di32, (int32_t *)p_dst, leftover_bytes); } } diff --git a/src/utils/acle/bits_to_bytes.hpp b/src/utils/acle/bits_to_bytes.hpp deleted file mode 100644 index 6769f5c..0000000 --- a/src/utils/acle/bits_to_bytes.hpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#pragma once - -#include "intrinsics.h" -#include -#include - -namespace armral { - -// Given a byte array, where we are interested in each bit, create -// an array of bytes instead in the passed-in array "out" -// Data is read from the most significant bit in each byte to the least -// significant -inline void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { - uint32_t full_bytes = n >> 3; - // Set the mask - uint8x16_t mask = vdupq_n_u8(1); - // When shifting, we are going to be dealing with two bytes at a time - int8x16_t shifts = {-7, -6, -5, -4, -3, -2, -1, 0, - -7, -6, -5, -4, -3, -2, -1, 0}; - // Set the base index for the bytes to 0 x 8, 1 x 8 - uint64x2_t base_index = {0x0, 0x0101010101010101}; - // Increment the index by two each iteration - uint8x16_t two = vdupq_n_u8(2); - uint32_t i = 0; - for (; i + 8 < full_bytes; i += 8) { - // Load 8 bytes into an uint8x16_t - uint8x16_t bytes = vld1d_u8(&in[i]); - - uint8x16_t index = vreinterpretq_u8_u64(base_index); - // We can unroll by a factor 2 by using vqtbl1q - for (int byte_ind = 0; byte_ind < 8; byte_ind += 2) { - uint8x16_t byte = vqtbl1q_u8(bytes, index); - // Shift the bits we want to convert into the rightmost position, and mask - // with 1 - uint8x16_t new_byte = vshlq_u8(byte, shifts); - new_byte = vandq_u8(new_byte, mask); - // Next loop - index = vaddq_u8(index, two); - vst1q_u8(&out[8 * (i + byte_ind)], new_byte); - } - } - - // Deal with a vector tail - uint8x8_t mask_tail = vdup_n_u8(1); - int8x8_t shift_tail = {-7, -6, -5, -4, -3, -2, -1, 0}; - for (; i < full_bytes; ++i) { 
- // Load a byte and duplicate to 8 lanes of a vector - uint8x8_t byte = vld1_dup_u8(&in[i]); - // Shift the bit we want in each lane to the right-most - // position, and mask with 1 - uint8x8_t new_bytes = vshl_u8(byte, shift_tail); - new_bytes = vand_u8(new_bytes, mask_tail); - vst1_u8(&out[8 * i], new_bytes); - } - - // Now deal with a scalar tail - if ((n & 7) != 0) { - uint8_t byte = in[full_bytes]; - uint32_t bit_ind = 0; - for (uint32_t j = 8 * i; j < n; ++j, ++bit_ind) { - out[j] = (byte >> (7 - bit_ind)) & 1; - } - } -} - -// Given a byte array, where we are interested in each bit, create -// an array of bytes instead and return it in a std::vector -inline std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { - std::vector out(n); - bits_to_bytes(n, in, out.data()); - return out; -} - -// Given a byte array of zeros and ones, write this out to -// consecutive bits instead. Bytes are assumed to be big endian -// so the first bit in a byte goes to the highest bit position -inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { - uint32_t full_bytes = n >> 3; - uint32_t tail_bits = n & 7; - for (uint32_t i = 0; i < full_bytes; ++i) { - out[i] = (in[i * 8] & 1) << 7; - for (uint32_t j = 1; j < 8; ++j) { - out[i] |= (in[i * 8 + j] & 1) << (7 - j); - } - } - if (tail_bits != 0) { - out[full_bytes] = (in[full_bytes * 8] & 1) << 7; - for (uint32_t j = 1; j < tail_bits; ++j) { - out[full_bytes] |= (in[full_bytes * 8 + j] & 1) << (7 - j); - } - } -} - -// Loop through all of the llrs, and set the corresponding bit to 1 if LLR is -// negative, otherwise to 0. We do not assume that the data_out pointer is -// initialized -template -inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { - uint32_t full_bytes = n >> 3; - uint32_t tail_bits = n & 7; - for (uint32_t i = 0; i < full_bytes; ++i) { - data_out[i] = 0; - for (uint32_t j = 0; j < 8; ++j) { - // The first bit to write in the byte is the most significant - if (llr[i * 8 + j] < 0) { - uint32_t bit_ind = 7 ^ j; - data_out[i] |= 1 << bit_ind; - } - } - } - // Deal with tail bits - if (tail_bits != 0) { - data_out[full_bytes] = 0; - for (uint32_t i = 0; i < tail_bits; ++i) { - // The first bit to write in the byte is the most significant - if (llr[full_bytes * 8 + i] < 0) { - uint32_t bit_ind = 7 ^ i; - data_out[full_bytes] |= 1 << bit_ind; - } - } - } -} - -} // namespace armral diff --git a/src/utils/bits_to_bytes.hpp b/src/utils/bits_to_bytes.hpp index 99ecdd9..b7ceabd 100644 --- a/src/utils/bits_to_bytes.hpp +++ b/src/utils/bits_to_bytes.hpp @@ -1,13 +1,133 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #pragma once -#ifndef ARMRAL_ARCH_HWY -#include "acle/bits_to_bytes.hpp" -#else +#ifdef ARMRAL_ARCH_HWY #include "highway/bits_to_bytes.hpp" +#else + +#include "intrinsics.h" +#include +#include + +namespace armral { + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead in the passed-in array "out" +// Data is read from the most significant bit in each byte to the least +// significant +inline void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { + uint32_t full_bytes = n >> 3; + // Set the mask + uint8x16_t mask = vdupq_n_u8(1); + // When shifting, we are going to be dealing with two bytes at a time + int8x16_t shifts = {-7, -6, -5, -4, -3, -2, -1, 0, + -7, -6, -5, 
-4, -3, -2, -1, 0}; + // Set the base index for the bytes to 0 x 8, 1 x 8 + uint64x2_t base_index = {0x0, 0x0101010101010101}; + // Increment the index by two each iteration + uint8x16_t two = vdupq_n_u8(2); + uint32_t i = 0; + for (; i + 8 < full_bytes; i += 8) { + // Load 8 bytes into an uint8x16_t + uint8x16_t bytes = vld1d_u8(&in[i]); + + uint8x16_t index = vreinterpretq_u8_u64(base_index); + // We can unroll by a factor 2 by using vqtbl1q + for (int byte_ind = 0; byte_ind < 8; byte_ind += 2) { + uint8x16_t byte = vqtbl1q_u8(bytes, index); + // Shift the bits we want to convert into the rightmost position, and mask + // with 1 + uint8x16_t new_byte = vshlq_u8(byte, shifts); + new_byte = vandq_u8(new_byte, mask); + // Next loop + index = vaddq_u8(index, two); + vst1q_u8(&out[8 * (i + byte_ind)], new_byte); + } + } + + // Deal with a vector tail + uint8x8_t mask_tail = vdup_n_u8(1); + int8x8_t shift_tail = {-7, -6, -5, -4, -3, -2, -1, 0}; + for (; i < full_bytes; ++i) { + // Load a byte and duplicate to 8 lanes of a vector + uint8x8_t byte = vld1_dup_u8(&in[i]); + // Shift the bit we want in each lane to the right-most + // position, and mask with 1 + uint8x8_t new_bytes = vshl_u8(byte, shift_tail); + new_bytes = vand_u8(new_bytes, mask_tail); + vst1_u8(&out[8 * i], new_bytes); + } + + // Now deal with a scalar tail + if ((n & 7) != 0) { + uint8_t byte = in[full_bytes]; + uint32_t bit_ind = 0; + for (uint32_t j = 8 * i; j < n; ++j, ++bit_ind) { + out[j] = (byte >> (7 - bit_ind)) & 1; + } + } +} + +// Given a byte array, where we are interested in each bit, create +// an array of bytes instead and return it in a std::vector +inline std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { + std::vector out(n); + bits_to_bytes(n, in, out.data()); + return out; +} + +// Given a byte array of zeros and ones, write this out to +// consecutive bits instead. Bytes are assumed to be big endian +// so the first bit in a byte goes to the highest bit position +inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + out[i] = (in[i * 8] & 1) << 7; + for (uint32_t j = 1; j < 8; ++j) { + out[i] |= (in[i * 8 + j] & 1) << (7 - j); + } + } + if (tail_bits != 0) { + out[full_bytes] = (in[full_bytes * 8] & 1) << 7; + for (uint32_t j = 1; j < tail_bits; ++j) { + out[full_bytes] |= (in[full_bytes * 8 + j] & 1) << (7 - j); + } + } +} + +// Loop through all of the llrs, and set the corresponding bit to 1 if LLR is +// negative, otherwise to 0. 
We do not assume that the data_out pointer is +// initialized +template +inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + uint32_t tail_bits = n & 7; + for (uint32_t i = 0; i < full_bytes; ++i) { + data_out[i] = 0; + for (uint32_t j = 0; j < 8; ++j) { + // The first bit to write in the byte is the most significant + if (llr[i * 8 + j] < 0) { + uint32_t bit_ind = 7 ^ j; + data_out[i] |= 1 << bit_ind; + } + } + } + // Deal with tail bits + if (tail_bits != 0) { + data_out[full_bytes] = 0; + for (uint32_t i = 0; i < tail_bits; ++i) { + // The first bit to write in the byte is the most significant + if (llr[full_bytes * 8 + i] < 0) { + uint32_t bit_ind = 7 ^ i; + data_out[full_bytes] |= 1 << bit_ind; + } + } + } +} + +} // namespace armral #endif \ No newline at end of file diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp index 633667d..615f2b3 100644 --- a/src/utils/highway/bits_to_bytes.hpp +++ b/src/utils/highway/bits_to_bytes.hpp @@ -1,13 +1,11 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #pragma once -#include #include "utils/hwy_types.hpp" +#include namespace hn = hwy::HWY_NAMESPACE; #include #include @@ -18,17 +16,18 @@ namespace armral { // an array of bytes instead in the passed-in array "out" // Data is read from the most significant bit in each byte to the least // significant -HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out) { +HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, + uint8_t *out) { const size_t bits_per_byte = 8; const size_t num_vecs = n / (hn::Lanes(du8x16) * bits_per_byte); const uint32_t tail_bits = n % (hn::Lanes(du8x16) * bits_per_byte); const uint32_t final_bits = n % hn::Lanes(du8x16); const Vec_u8x16 k1 = hn::Set(du8x16, uint8_t{0x01}); - const Vec_u8x16 shifts = hn::Dup128VecFromValues(du8x16, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, - 5, 4, 3, 2, 1, 0); - const Vec_u8x16 base_indices = hn::Dup128VecFromValues(du8x16, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1); + const Vec_u8x16 shifts = hn::Dup128VecFromValues(du8x16, 7, 6, 5, 4, 3, 2, 1, + 0, 7, 6, 5, 4, 3, 2, 1, 0); + const Vec_u8x16 base_indices = hn::Dup128VecFromValues( + du8x16, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); // Process in 128-bit blocks for (size_t i = 0; i < num_vecs; i++) { @@ -64,7 +63,8 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts)); bool store_remainder = (byte_ind + 2) >= remaining_bytes; hn::StoreN(spread_bits, du8x16, out, - (store_remainder && final_bits) ? final_bits : hn::Lanes(du8x16)); + (store_remainder && final_bits) ? 
final_bits + : hn::Lanes(du8x16)); out += hn::Lanes(du8x16); } } @@ -72,7 +72,8 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in, uint8_t *out // Given a byte array, where we are interested in each bit, create // an array of bytes instead and return it in a std::vector -HWY_FORCED_INLINE std::vector bits_to_bytes(uint32_t n, const uint8_t *in) { +HWY_FORCED_INLINE std::vector bits_to_bytes(uint32_t n, + const uint8_t *in) { std::vector out(n); bits_to_bytes(n, in, out.data()); return out; @@ -81,7 +82,8 @@ HWY_FORCED_INLINE std::vector bits_to_bytes(uint32_t n, const uint8_t * // Given a byte array of zeros and ones, write this out to // consecutive bits instead. Bytes are assumed to be big endian // so the first bit in a byte goes to the highest bit position -HWY_FORCED_INLINE void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { +HWY_FORCED_INLINE void bytes_to_bits(uint32_t n, const uint8_t *in, + uint8_t *out) { uint32_t full_bytes = n >> 3; uint32_t tail_bits = n & 7; for (uint32_t i = 0; i < full_bytes; ++i) { @@ -102,7 +104,8 @@ HWY_FORCED_INLINE void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out // negative, otherwise to 0. We do not assume that the data_out pointer is // initialized template -HWY_FORCED_INLINE void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { +HWY_FORCED_INLINE void llrs_to_bits(uint32_t n, const T *llr, + uint8_t *data_out) { uint32_t full_bytes = n >> 3; uint32_t tail_bits = n & 7; for (uint32_t i = 0; i < full_bytes; ++i) { diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index c7ffccd..22e363e 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -1,8 +1,6 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2023-2025 - Cambridge Consultants Project Reference P5851 */ /* -- GitLab From c0a699f60d494ce770ffd9da0cc1661f764f06c6 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Mon, 20 Jan 2025 10:44:25 +0000 Subject: [PATCH 17/20] Merge back changes requested by Arm in CRC review --- CREDITS.md | 11 + src/UpperPHY/CRC/acle/crc_common.hpp | 271 ----------------------- src/UpperPHY/CRC/crc_common.hpp | 279 +++++++++++++++++++++++- src/UpperPHY/CRC/highway/crc_common.hpp | 4 +- 4 files changed, 285 insertions(+), 280 deletions(-) delete mode 100644 src/UpperPHY/CRC/acle/crc_common.hpp diff --git a/CREDITS.md b/CREDITS.md index 1249662..3e5abe7 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -15,6 +15,17 @@ Acceleration Library: have been contributed by Cambridge Consultants. See . +- Addition of the Google Highway crc implementation in + `src/UpperPHY/CRC/highway/crc_common.hpp` was contributed by + Cambridge Consultants. See + . + +- Addition of a Google Highway as a fourth architecture `-DARMRAL_ARCH=HWY`. + Enabling future development using Google Highway platform agnostic + intrinsic implementations was contributed upstream by Cambridge + Consultants. See + . + - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. 
See diff --git a/src/UpperPHY/CRC/acle/crc_common.hpp b/src/UpperPHY/CRC/acle/crc_common.hpp deleted file mode 100644 index 47bf69e..0000000 --- a/src/UpperPHY/CRC/acle/crc_common.hpp +++ /dev/null @@ -1,271 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#pragma once - -#include - -static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { - // Sometimes compilers don't realize that they don't need an extra - // instruction to extract the 0th lane of a vector, e.g. when doing - // vmull_p64(a[0], b[0]), so this just gets around that. - poly128_t res; - asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { - // If vmull_high_p64 is used, then clang might use a mov to general - // purpose registers and back follow by a pmull. This forces the use - // of a single pmull2 instruction instead. - poly128_t res; - asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); - return res; -} - -template -static inline poly64x2_t load_p64x2(const poly64_t *p_in) { - poly64x2_t vec = vld1q_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -template -static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { - poly64x2_t vec = vld1q_dup_p64(p_in); - if (Endianness == 'b') { - vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); - } - return vec; -} - -static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { - // There are two reasons why we can't just use the vaddq_p64 intrinsic: - // 1. It isn't available on the earliest GCC version we currently support - // 2. If GCC recognizes that this is an associative operation, then it tries - // to optimize the operation tree in its tree-reassoc pass, but it - // actually makes the performance much worse. Hiding it in assembly means - // that the compiler uses our carefully balanced operation tree instead. - uint8x16_t res; - asm("eor %0.16b, %1.16b, %2.16b" - : "=w"(res) - : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); - return (poly64x2_t)res; -} - -/** - * Computes a CRC64 in big- or little-endian mode using the specified shifts - * and polynomials. This can be used for smaller polynomials by shifting - * them to a degree 64 polynomial. - * - * @tparam BarretShift the shift used when computing @c ls1_divp. - * @param[in] size number of bytes of the given buffer - * @param[in] input points to the input byte sequence - * @param[out] crc the computed CRC - * @param[in] constants the constants specific to each polynomial: - constants[0] = padding - constants[1] = (1<<128) / P_CRC - (1<<64) - constants[2:11] = [ (1<<(64*k)) mod P_CRC, - for k in [1,1,2,3,4,5,6,7,8,9] ] - */ -template -static inline __attribute__((always_inline)) void -crc64(uint32_t size, const uint64_t *input, uint64_t *crc, - const poly64_t constants[]) { - const poly64_t *p_in = (const poly64_t *)input; - - if (size == 8) { - // Special case for <=64 bits - poly64x2_t divp_p = vld1q_p64(&constants[1]); - - // This might compile to a separate ldr and dup, which is - // fine because the operation using the upper half depends - // on the output of the operation using the lower half. 
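For orientation, the quantity this crc64 kernel produces with PMULL folding and Barrett reduction is the remainder of the zero-padded message polynomial modulo P_CRC. A plain bit-at-a-time model of that remainder is sketched below as a reference for the mathematics only; it assumes MSB-first bit order, a zero initial value and no final XOR, omits the Endianness template and the padding constant handled by the real code, and uses the made-up name crc64_bitwise_ref.

#include <cstddef>
#include <cstdint>

// Bit-wise long division: returns M(x) * x^64 mod P(x), where `poly` holds the
// degree-64 polynomial P without its leading x^64 term.
static uint64_t crc64_bitwise_ref(const uint8_t *data, size_t len,
                                  uint64_t poly) {
  uint64_t crc = 0;
  for (size_t i = 0; i < len; ++i) {
    crc ^= (uint64_t)data[i] << 56; // fold the next byte into the top bits
    for (int b = 0; b < 8; ++b) {
      crc = (crc & (1ULL << 63)) ? (crc << 1) ^ poly : (crc << 1);
    }
  }
  return crc;
}
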
- poly64x2_t v11 = load_dup_p64(p_in); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); - vb = add_p64x2(vb, v11); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load constants for size = 16 - poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); - poly64x2_t ls11modp = vld1q_p64(&constants[2]); - poly64x2_t ls23modp = vld1q_p64(&constants[4]); - - if (size == 16) { - poly64x2_t v21 = load_p64x2(p_in); - poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // Load the rest of the constants - poly64x2_t ls45modp = vld1q_p64(&constants[6]); - poly64x2_t ls67modp = vld1q_p64(&constants[8]); - poly64x2_t ls89modp = vld1q_p64(&constants[10]); - - if (size == 32) { - poly64x2_t v43a = load_p64x2(p_in); - poly64x2_t v19 = load_p64x2(p_in + 2); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); - poly64x2_t v01 = add_p64x2(v01a, v01e); - v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit - uint32_t init_bytes = size % 64; - const poly64_t *p_end = p_in + (size - 16) / 8; - - // These values are carried forwards to the next loop iteration each time. 
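The constants (1 << (64*k)) mod P_CRC described above are what make the block folding below work: multiplying a carried partial remainder by x^(64k) mod P and XOR-ing in the next block leaves the final remainder unchanged. A tiny self-contained demonstration of the same identity with an 8-bit polynomial follows; the polynomial 0x107 and the helper names clmul8 and polymod are chosen only so the arithmetic fits in a few lines, while the kernel itself folds 128-bit vectors with PMULL.

#include <cassert>
#include <cstdint>

// Carry-less (GF(2)) multiply of two 8-bit polynomials, giving a 16-bit product.
static uint16_t clmul8(uint8_t a, uint8_t b) {
  uint16_t r = 0;
  for (int i = 0; i < 8; ++i) {
    if (b & (1u << i)) {
      r ^= (uint16_t)(a << i);
    }
  }
  return r;
}

// Remainder of a `bits`-wide polynomial modulo P(x) = x^8 + x^2 + x + 1 (0x107).
static uint8_t polymod(uint32_t v, int bits) {
  for (int i = bits - 1; i >= 8; --i) {
    if (v & (1u << i)) {
      v ^= 0x107u << (i - 8);
    }
  }
  return (uint8_t)v;
}

int main() {
  const uint8_t a = 0xC5;        // block already folded into the running state
  const uint8_t b = 0x3A;        // next block
  const uint8_t x8_mod_p = 0x07; // x^8 mod P, the analogue of the constants above
  // Direct remainder of the concatenated message a||b.
  uint8_t direct = polymod(((uint32_t)a << 8) | b, 16);
  // Folded: multiply the carried part by (x^8 mod P), add the next block, reduce.
  uint8_t folded = polymod((uint32_t)clmul8(a, x8_mod_p) ^ b, 16);
  assert(direct == folded);
  return direct == folded ? 0 : 1;
}
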
- poly64x2_t v01; - - if (init_bytes == 16) { - v01 = vdupq_n_p64(0); - p_in += 8; - } else if (init_bytes == 32) { - poly64x2_t v43 = load_p64x2(p_in); - p_in += 10; - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01 = add_p64x2(v01a, v01e); - } else if (init_bytes == 48) { - poly64x2_t v65 = load_p64x2(p_in); - poly64x2_t v43 = load_p64x2(p_in + 2); - p_in += 12; - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - } else { - poly64x2_t v87 = load_p64x2(p_in); - poly64x2_t v65 = load_p64x2(p_in + 2); - poly64x2_t v43 = load_p64x2(p_in + 4); - p_in += 14; - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v19 = load_p64x2(p_in - 8); - - if (size <= 64) { - poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); - v01 = add_p64x2(v01, v01a); - poly64x2_t vx1 = add_p64x2(v01, v19); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); - return; - } - - poly64x2_t v87 = load_p64x2(p_in - 6); - poly64x2_t v65 = load_p64x2(p_in - 4); - poly64x2_t v43 = load_p64x2(p_in - 2); - - while (p_in < p_end) { - poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - v19 = load_p64x2(p_in); - v87 = load_p64x2(p_in + 2); - - poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01b = add_p64x2(v01b, v01bb); - - poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - - v65 = load_p64x2(p_in + 4); - v43 = load_p64x2(p_in + 6); - p_in += 8; - - v01a = add_p64x2(v01a, v01aa); - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - } - - poly64x2_t v21 = load_p64x2(p_in); - - poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); - poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); - poly64x2_t vx9 = add_p64x2(v01, v19); - poly64x2_t v8x = add_p64x2(v87, v01); - - poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); - poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); - - v01f = add_p64x2(v01f, v01ff); - v01e = add_p64x2(v01e, v01ee); - v01e = add_p64x2(v01e, v01f); - - poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); - poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); - poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); - poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); - poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); - - v01c = add_p64x2(v01c, v01d); - v01a = add_p64x2(v01a, v01b); - v01e = add_p64x2(v01e, v01g); - v01a = add_p64x2(v01a, v01c); - v01 = add_p64x2(v01a, v01e); - - poly64x2_t vx1 = add_p64x2(v01, v21); - - // Barret reduction - poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); - vb = add_p64x2(vb, vx1); - poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); - v0x = add_p64x2(v0x, v01); - *crc = (uint64_t)(v0x[0]); -} diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index e65d27b..e97b466 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,11 +1,278 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ -#ifndef ARMRAL_ARCH_HWY -#include "acle/crc_common.hpp" -#else + +#pragma once + +#ifdef ARMRAL_ARCH_HWY #include "highway/crc_common.hpp" -#endif +#else + +#include + +static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { + // Sometimes compilers don't realize that they don't need an extra + // instruction to extract the 0th lane of a vector, e.g. when doing + // vmull_p64(a[0], b[0]), so this just gets around that. + poly128_t res; + asm("pmull %0.1q, %1.1d, %2.1d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +static inline poly128_t vmull_force_high_p64(poly64x2_t a, poly64x2_t b) { + // If vmull_high_p64 is used, then clang might use a mov to general + // purpose registers and back follow by a pmull. This forces the use + // of a single pmull2 instruction instead. + poly128_t res; + asm("pmull2 %0.1q, %1.2d, %2.2d" : "=w"(res) : "w"(a), "w"(b)); + return res; +} + +template +static inline poly64x2_t load_p64x2(const poly64_t *p_in) { + poly64x2_t vec = vld1q_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +template +static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { + poly64x2_t vec = vld1q_dup_p64(p_in); + if (Endianness == 'b') { + vec = (poly64x2_t)vrev64q_u8((uint8x16_t)vec); + } + return vec; +} + +static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { + // There are two reasons why we can't just use the vaddq_p64 intrinsic: + // 1. It isn't available on the earliest GCC version we currently support + // 2. If GCC recognizes that this is an associative operation, then it tries + // to optimize the operation tree in its tree-reassoc pass, but it + // actually makes the performance much worse. Hiding it in assembly means + // that the compiler uses our carefully balanced operation tree instead. + uint8x16_t res; + asm("eor %0.16b, %1.16b, %2.16b" + : "=w"(res) + : "w"((uint8x16_t)a), "w"((uint8x16_t)b)); + return (poly64x2_t)res; +} + +/** + * Computes a CRC64 in big- or little-endian mode using the specified shifts + * and polynomials. This can be used for smaller polynomials by shifting + * them to a degree 64 polynomial. 
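+ *
+ * In outline: the input is folded 128 bits at a time using the GF(2) identity
+ *   (a(x) * x^(64k)) mod P(x) == (a(x) * (x^(64k) mod P(x))) mod P(x),
+ * so each block is carry-less multiplied (PMULL) by one of the precomputed
+ * remainders in constants[2:11] and XOR-ed into a running 128-bit value
+ * instead of being reduced immediately. A single Barrett reduction using the
+ * precomputed quotient in constants[1] then produces the final 64-bit CRC.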
+ * + * @tparam BarretShift the shift used when computing @c ls1_divp. + * @param[in] size number of bytes of the given buffer + * @param[in] input points to the input byte sequence + * @param[out] crc the computed CRC + * @param[in] constants the constants specific to each polynomial: + constants[0] = padding + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, + for k in [1,1,2,3,4,5,6,7,8,9] ] + */ +template +static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { + const poly64_t *p_in = (const poly64_t *)input; + + if (size == 8) { + // Special case for <=64 bits + poly64x2_t divp_p = vld1q_p64(&constants[1]); + + // This might compile to a separate ldr and dup, which is + // fine because the operation using the upper half depends + // on the output of the operation using the lower half. + poly64x2_t v11 = load_dup_p64(p_in); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_low_p64(v11, divp_p); + vb = add_p64x2(vb, v11); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, divp_p); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load constants for size = 16 + poly64x2_t lsamodp_divp = vld1q_p64(&constants[0]); + poly64x2_t ls11modp = vld1q_p64(&constants[2]); + poly64x2_t ls23modp = vld1q_p64(&constants[4]); + + if (size == 16) { + poly64x2_t v21 = load_p64x2(p_in); + poly64x2_t v01 = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // Load the rest of the constants + poly64x2_t ls45modp = vld1q_p64(&constants[6]); + poly64x2_t ls67modp = vld1q_p64(&constants[8]); + poly64x2_t ls89modp = vld1q_p64(&constants[10]); + + if (size == 32) { + poly64x2_t v43a = load_p64x2(p_in); + poly64x2_t v19 = load_p64x2(p_in + 2); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43a, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43a, ls23modp); + poly64x2_t v01 = add_p64x2(v01a, v01e); + v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit + uint32_t init_bytes = size % 64; + const poly64_t *p_end = p_in + (size - 16) / 8; + + // These values are carried forwards to the next loop iteration each time. 
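+  // v01 is the running fold: the data processed so far, multiplied by the
+  // appropriate x^(64k) mod P_CRC constants so that it stays aligned with the
+  // 64-byte block currently in flight. After the last block it is combined
+  // with the final two words and Barrett-reduced to give the CRC.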
+ poly64x2_t v01; + + if (init_bytes == 16) { + v01 = vdupq_n_p64(0); + p_in += 8; + } else if (init_bytes == 32) { + poly64x2_t v43 = load_p64x2(p_in); + p_in += 10; + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01 = add_p64x2(v01a, v01e); + } else if (init_bytes == 48) { + poly64x2_t v65 = load_p64x2(p_in); + poly64x2_t v43 = load_p64x2(p_in + 2); + p_in += 12; + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + } else { + poly64x2_t v87 = load_p64x2(p_in); + poly64x2_t v65 = load_p64x2(p_in + 2); + poly64x2_t v43 = load_p64x2(p_in + 4); + p_in += 14; + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v87, ls89modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v19 = load_p64x2(p_in - 8); + + if (size <= 64) { + poly64x2_t v01a = (poly64x2_t)vmull_force_low_p64(v19, ls23modp); + v01 = add_p64x2(v01, v01a); + poly64x2_t vx1 = add_p64x2(v01, v19); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); + return; + } + + poly64x2_t v87 = load_p64x2(p_in - 6); + poly64x2_t v65 = load_p64x2(p_in - 4); + poly64x2_t v43 = load_p64x2(p_in - 2); + + while (p_in < p_end) { + poly64x2_t v01bb = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01b = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + v19 = load_p64x2(p_in); + v87 = load_p64x2(p_in + 2); + + poly64x2_t v01g = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = (poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01b = add_p64x2(v01b, v01bb); + + poly64x2_t v01aa = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + + v65 = load_p64x2(p_in + 4); + v43 = load_p64x2(p_in + 6); + p_in += 8; + + v01a = add_p64x2(v01a, v01aa); + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + } + + poly64x2_t v21 = load_p64x2(p_in); + + poly64x2_t v01ff = (poly64x2_t)vmull_force_low_p64(v19, lsamodp_divp); + poly64x2_t v01f = (poly64x2_t)vmull_force_high_p64(v87, ls67modp); + poly64x2_t vx9 = add_p64x2(v01, v19); + poly64x2_t v8x = add_p64x2(v87, v01); + + poly64x2_t v01ee = (poly64x2_t)vmull_force_high_p64(vx9, ls89modp); + poly64x2_t v01e = 
(poly64x2_t)vmull_force_low_p64(v8x, ls89modp); + + v01f = add_p64x2(v01f, v01ff); + v01e = add_p64x2(v01e, v01ee); + v01e = add_p64x2(v01e, v01f); + + poly64x2_t v01d = (poly64x2_t)vmull_force_low_p64(v65, ls67modp); + poly64x2_t v01c = (poly64x2_t)vmull_force_high_p64(v65, ls45modp); + poly64x2_t v01b = (poly64x2_t)vmull_force_low_p64(v43, ls45modp); + poly64x2_t v01a = (poly64x2_t)vmull_force_high_p64(v43, ls23modp); + poly64x2_t v01g = (poly64x2_t)vmull_force_low_p64(v21, ls23modp); + + v01c = add_p64x2(v01c, v01d); + v01a = add_p64x2(v01a, v01b); + v01e = add_p64x2(v01e, v01g); + v01a = add_p64x2(v01a, v01c); + v01 = add_p64x2(v01a, v01e); + + poly64x2_t vx1 = add_p64x2(v01, v21); + + // Barret reduction + poly64x2_t vb = (poly64x2_t)vmull_force_high_p64(vx1, lsamodp_divp); + vb = add_p64x2(vb, vx1); + poly64x2_t v0x = (poly64x2_t)vmull_force_high_p64(vb, ls11modp); + v0x = add_p64x2(v0x, v01); + *crc = (uint64_t)(v0x[0]); +} + +#endif \ No newline at end of file diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 5877bfa..11c2741 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -1,13 +1,11 @@ /* Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates - Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 - Cambridge Consultants Project Reference P5851 */ #pragma once -#include #include "utils/hwy_types.hpp" +#include namespace hn = hwy::HWY_NAMESPACE; -- GitLab From d07fe9d8dde107059276e70d9ece6f8425037bee Mon Sep 17 00:00:00 2001 From: Will Barber Date: Tue, 21 Jan 2025 15:52:08 +0000 Subject: [PATCH 18/20] Port Turbo Encoder & Decoder Due to overhead from load masking due to needing to use fixed sized vectors Highway's SVE & SVE2 implementations are disabled for the decoder compilation unit. --- armral_hwy.cmake | 38 +- src/UpperPHY/Turbo/arm_turbo_decoder.cpp | 1 - src/UpperPHY/Turbo/arm_turbo_decoder.hpp | 5 + .../Turbo/highway/arm_turbo_decoder.hpp | 547 ++++++++++++++++++ .../Turbo/highway/arm_turbo_rate_recovery.cpp | 247 ++++++++ src/utils/hwy_types.hpp | 10 +- test/UpperPHY/Turbo/turbo_test_data.hpp | 16 + 7 files changed, 848 insertions(+), 16 deletions(-) create mode 100644 src/UpperPHY/Turbo/highway/arm_turbo_decoder.hpp create mode 100644 src/UpperPHY/Turbo/highway/arm_turbo_rate_recovery.cpp diff --git a/armral_hwy.cmake b/armral_hwy.cmake index 12e0274..5aae0b1 100644 --- a/armral_hwy.cmake +++ b/armral_hwy.cmake @@ -73,6 +73,16 @@ set_property( APPEND PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + +# The Turbo decoder implementation does not support scalable vectors and is +# memory access heavy. The overhead of implicit masking when using fixed 128-bit +# vectors causes a ~60% overhead. +set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS + HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE) + # GCC recognizes the usage of XOR as an associative operation, then it tries to # optimize the operation tree in its tree-reassoc pass, but it actually makes # the performance much worse. 
Disabling the tree-assoc pass means that the @@ -168,10 +178,10 @@ set(ARMRAL_LIB_SOURCES # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/highway/arm_turbo_rate_recovery.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp ) @@ -425,11 +435,11 @@ if(BUILD_TESTING) # test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) # add_armral_test(polar_subchannel_interleave # test/UpperPHY/Polar/SubchannelInterleave/main.cpp) -# add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) -# add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) -# add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) -# add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) -# add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) + add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) # add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) # # add_armral_bench( @@ -627,12 +637,12 @@ if(BUILD_TESTING) # bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) # add_armral_bench(polar_subchannel_interleave # bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) -# add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) -# add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) -# add_armral_bench(turbo_rate_matching -# bench/UpperPHY/Turbo/RateMatching/main.cpp) -# add_armral_bench(turbo_rate_recovery -# bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) # add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) # cmake-format: on endif() diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index cc380fd..944a17a 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -3,7 +3,6 @@ SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "intrinsics.h" #include "turbo_code.hpp" #include "turbo_tables.hpp" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp b/src/UpperPHY/Turbo/arm_turbo_decoder.hpp index f2e3012..775fcb3 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp +++ 
b/src/UpperPHY/Turbo/arm_turbo_decoder.hpp @@ -4,6 +4,10 @@ */ #pragma once +#ifdef ARMRAL_ARCH_HWY +#include "highway/arm_turbo_decoder.hpp" +#else +#include "intrinsics.h" #include namespace { @@ -482,3 +486,4 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Make a hard decision based on the final LLRs turbo_llrs_to_bits(k, l1_uky.get(), dst); } +#endif diff --git a/src/UpperPHY/Turbo/highway/arm_turbo_decoder.hpp b/src/UpperPHY/Turbo/highway/arm_turbo_decoder.hpp new file mode 100644 index 0000000..5d2e718 --- /dev/null +++ b/src/UpperPHY/Turbo/highway/arm_turbo_decoder.hpp @@ -0,0 +1,547 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ +#pragma once + +#include "utils/hwy_types.hpp" +#include +#include +namespace hn = hwy::HWY_NAMESPACE; + +namespace { + +struct Vec_i16x8_val { + int16_t val[8]; +}; + +struct Vec_i16x4_val { + int16_t val[4]; +}; + +struct Vec_i16x4x8_val { + Vec_i16x4_val val[8]; +}; + +// With Turbo codes n (=k) is always divisible by 8 so we +// do not have to worry about tail bits +HWY_FORCED_INLINE void turbo_llrs_to_bits(uint32_t n, const Vec_i16x8_val *llr, + uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + const Vec_i16x8 ones = + hn::Dup128VecFromValues(di16x8, 128, 64, 32, 16, 8, 4, 2, 1); + + for (uint32_t i = 0; i < full_bytes; ++i) { + // The first bit to write in the byte is the most significant + Vec_i16x8 llrs = hn::LoadU(di16x8, llr[i].val); + Vec_u16x8 mask = + hn::BitCast(du16x8, hn::IfNegativeThenElseZero(llrs, ones)); + data_out[i] = (uint8_t)hn::ReduceSum(du16x8, mask); + } +} + +// Take the input int8_t LLRs and convert them to int16x8_ts +HWY_FORCED_INLINE void convert_llrs(uint32_t k, const int8_t *llrs, + Vec_i16x8_val *llrs_i16) { + // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time + for (uint32_t i = 0, j = 0; i < k; i += 8, j++) { + Vec_i16x8 res = hn::PromoteTo(di16x8, hn::LoadU(di16x8_di8x16, &llrs[i])); + hn::StoreU(res, di16x8, llrs_i16[j].val); + } +} + +// Update the extrinsic information output from the decoding stage +// based on the computed LLRs, the old extrinsic information and the input +HWY_FORCED_INLINE void update_extrinsic(uint32_t len, const Vec_i16x8_val *llr, + Vec_i16x8_val *extrinsic, + const Vec_i16x8_val *input) { + for (uint32_t i = 0; i < len; i++) { + const Vec_i16x8 llr_v = hn::LoadU(di16x8, llr[i].val); + Vec_i16x8 extrinsic_v = hn::LoadU(di16x8, extrinsic[i].val); + const Vec_i16x8 input_v = hn::LoadU(di16x8, input[i].val); + + extrinsic_v = + hn::SaturatedSub(hn::SaturatedSub(llr_v, extrinsic_v), input_v); + hn::StoreU(extrinsic_v, di16x8, extrinsic[i].val); + } +} + +// Calculate the trellis termination values. These are independent of the +// extrinsic information and so can be done once without needing to be updated +// on every iteration. +Vec_i16x8 trellis_termination(const Vec_i16x8_val *sys, + const Vec_i16x8_val *par, uint32_t k8, + Vec_i16x8 l_c) { + // We handle the gammas for the trellis termination bits separately + // as the state transitions are different. The x_{kl} are never 1 + // here, because we always use inputs of 0 to drive the trellis back + // to state 0 in the encoder, so we only need to consider a smaller + // number of state transitions. We also do not have any extrinsic + // information. 
Because some of the gamma terms will always be + // -INFINITY (specifically indices [1] and [3]) we can forgo adding + // to them to beta or taking the max with them, compared with when + // we calculate beta in the main calculations. As above, we assume + // that the channel reliability parameter l_c/2 = 1. + Vec_i16x8 sys_v = hn::LoadU(di16x8, sys[k8].val); + Vec_i16x8 par_v = hn::LoadU(di16x8, par[k8].val); + Vec_i16x8 pdf_00 = hn::SaturatedAdd(sys_v, par_v); + Vec_i16x8 pdf_01 = hn::SaturatedSub(sys_v, par_v); + + Vec_i16x8 g0102 = hn::InterleaveWholeLower(hn::BroadcastLane<1>(pdf_00), + hn::BroadcastLane<1>(pdf_01)); + + Vec_i16x8 b01_part = hn::InterleaveWholeLower(hn::BroadcastLane<2>(pdf_00), + hn::BroadcastLane<2>(pdf_01)); + Vec_i16x8 b01 = hn::InterleaveWholeLower(b01_part, b01_part); + + Vec_i16x8 beta_term = hn::SaturatedAdd(g0102, b01); + + int16_t g_idx[8] = {0, 8, 0, 8, 8, 0, 8, 0}; + Vec_i16x8 g = hn::TwoTablesLookupLanes(pdf_00, pdf_01, + hn::SetTableIndices(di16x8, g_idx)); + + Vec_i16x8 b0123 = hn::InterleaveWholeLower(beta_term, beta_term); + + return hn::SaturatedAdd(g, b0123); +} + +// A single max-log-MAP decoder that works on an array of systematic bits (sys), +// an array of parity bits (par), and an array of extrinsic values from a +// previous decoding stage (extrinsic) +void decode_step(const Vec_i16x8_val *sys, const Vec_i16x8_val *par, + const Vec_i16x8_val *extrinsic, uint32_t k8, + Vec_i16x8_val *llr, Vec_i16x8_val *alpha, Vec_i16x8 beta_tail, + Vec_i16x4x8_val *pdf4, Vec_i16x8 l_c) { + uint32_t k_idx; + uint32_t kp1_idx; + + // Start by computing the non-zero conditional state transition probabilities + // from state s' to state s for every k, denoted gamma_k(s',s). In general for + // an AWGN channel (ignoring extrinsic information in l_uk): + // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) + // Here there are only 2 possible state transitions into each state + // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 + // terms. + for (uint32_t i = 0; i < k8; i++) { + // The x_{kl} values are the actual systematic and parity values that + // would result from the encoder having transited from state s' to s. + // They can only ever be either 0 or 1 so we precompute the four possible + // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note + // that these 0s and 1s have to be converted to 1s and -1s to match the + // values in y. + // + // The y_{kl} values are the observed systematic and parity inputs. + // These have potentially been perturbed by noise on the channel. + // + // Although each of the 8 states of the encoder has in theory 8 + // predecessor states, the encoder's structure means that not all state + // transitions are possible. Each state actually only has 2 predecessor + // states so we only have to compute 16 non-zero values for each input + // LLR. + // + // We calculate the PDF of the state transition probability on the + // assumption that we are operating on an AWGN channel: + // PDF = (x1/2 (l_uk + l_c*y1)) + (l_c/2 x2 y2) + // where l_uk is the extrinsic information, y1 is the systematic + // input, and y2 is the parity input. We assume the channel + // reliability, l_c, is set such that l_c/2 = 1 and therefore omit + // it from the calculation. See arm_turbo_decoder.cpp for + // justification. 
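+    //
+    // Concretely, mapping each transmitted bit to an antipodal value
+    // x in {+1, -1} (bit 0 -> +1, bit 1 -> -1) and writing
+    // term = l_uk/2 + y1, the four possible PDF values computed below are:
+    //   pdf_00 = +term + par   (systematic bit 0, parity bit 0)
+    //   pdf_01 = +term - par   (systematic bit 0, parity bit 1)
+    //   pdf_10 = -term + par   (systematic bit 1, parity bit 0)
+    //   pdf_11 = -term - par   (systematic bit 1, parity bit 1)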
+ + Vec_i16x8 extrinsic_div_2 = + hn::ShiftRight<1>(hn::LoadU(di16x8, extrinsic[i].val)); + Vec_i16x8 sys_v = hn::LoadU(di16x8, sys[i].val); + Vec_i16x8 par_v = hn::LoadU(di16x8, par[i].val); + + Vec_i16x8 term = hn::SaturatedAdd(extrinsic_div_2, sys_v); + Vec_i16x8 pdf_00 = hn::SaturatedAdd(term, par_v); + Vec_i16x8 pdf_10 = hn::SaturatedSub(par_v, term); + Vec_i16x8 pdf_01 = hn::SaturatedSub(term, par_v); + Vec_i16x8 pdf_11 = hn::SaturatedSub(hn::SaturatedNeg(term), par_v); + + // There is considerable duplication in the values we could store. For + // example, for a single state the 16 gamma values are: + // + // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; + // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; + // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; + // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; + // + // We therefore choose to store the 4 unique pdf values (using + // st4) as this allows us to access the pdf values contiguously in + // the calculations needed for the alpha and beta values. + hn::StoreInterleaved4(pdf_00, pdf_10, pdf_01, pdf_11, di16x8, + (int16_t *)pdf4[i].val); + + // Accumulate the state transition probabilities forwards through + // the state transition trellis starting from the known encoder + // start state 0. + + const int8_t idx_0123321[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 6, 7, 4, 5, 2, 3, 0, 1}; + + const int8_t idx_32100123[16] = {6, 7, 4, 5, 2, 3, 0, 1, + 0, 1, 2, 3, 4, 5, 6, 7}; + + for (uint32_t j = 0; j < 8; j++) { + k_idx = 8 * i + j; + kp1_idx = k_idx + 1; + Vec_i16x8 pdf4_v = + hn::ZeroExtendVector(di16x8, hn::LoadU(di16x4, pdf4[i].val[j].val)); + + // We need g02 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0], + // gamma[g_k_idx][2], gamma[g_k_idx + 1][2], + // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; + Vec_i16x8 g02 = hn::BitCast( + di16x8, + hn::TableLookupLanes(hn::BitCast(di8x16, pdf4_v), + hn::SetTableIndices(di8x16, idx_0123321))); + + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2], + // alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + Vec_i16x8 alpha_v = hn::LoadU(di16x8, alpha[k_idx].val); + Vec_i16x8 a02 = hn::ConcatEven(di16x8, alpha_v, alpha_v); + Vec_i16x8 left = hn::SaturatedAdd(g02, a02); + + // This is g02 with the 64-bit elements swapped + Vec_i16x8 g20 = hn::BitCast( + di16x8, + hn::TableLookupLanes(hn::BitCast(di8x16, pdf4_v), + hn::SetTableIndices(di8x16, idx_32100123))); + + // We need a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3], + // alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + Vec_i16x8 a13 = hn::ConcatOdd(di16x8, alpha_v, alpha_v); + Vec_i16x8 right = hn::SaturatedAdd(g20, a13); + + hn::StoreU(hn::Max(left, right), di16x8, alpha[kp1_idx].val); + + // Normalize alpha + if (j % 4 == 0) { + Vec_i16x8 alpha0_v = hn::Set(di16x8, alpha[kp1_idx].val[0]); + Vec_i16x8 alpha_kp1_v = hn::LoadU(di16x8, alpha[kp1_idx].val); + hn::StoreU(hn::SaturatedSub(alpha_kp1_v, alpha0_v), di16x8, + alpha[kp1_idx].val); + } + } + } + + // Accumulate the state transition probabilities backwards through the state + // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1. 
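+
+  // The idx_* tables below are byte indices for TableLookupLanes on a
+  // byte-reinterpreted vector: each pair of consecutive bytes (2j, 2j+1)
+  // selects 16-bit lane j, e.g. bytes {0,1} pick lane 0 and {4,5} pick lane 2.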
+ + const uint8_t idx_even_odd[16] = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; + + const uint8_t idx_05274163[16] = {0, 1, 10, 11, 4, 5, 14, 15, + 8, 9, 2, 3, 12, 13, 6, 7}; + + const uint8_t idx_0220[16] = {0, 1, 4, 5, 4, 5, 0, 1, 0, 1, 4, 5, 4, 5, 0, 1}; + + const uint8_t idx_3113[16] = {6, 7, 2, 3, 2, 3, 6, 7, 6, 7, 2, 3, 2, 3, 6, 7}; + + const uint8_t idx_0213[16] = {0, 1, 6, 7, 2, 3, 4, 5, 4, 5, 2, 3, 6, 7, 0, 1}; + + const uint8_t idx_1302[16] = {6, 7, 0, 1, 4, 5, 2, 3, 2, 3, 4, 5, 0, 1, 6, 7}; + + Vec_i16x8 beta_kp1 = beta_tail; + + for (int32_t i = k8 - 1; i >= 0; i--) { + int16_t prob_0[hn::Lanes(di16x8)]; + int16_t prob_1[hn::Lanes(di16x8)]; + + for (int32_t j = 7; j >= 0; j--) { + k_idx = 8 * i + j; + + // Normalize beta + if (j % 4 == 0) { + Vec_i16x8 beta0 = hn::BroadcastLane<0>(beta_kp1); + beta_kp1 = hn::SaturatedSub(beta_kp1, beta0); + } + + Vec_i16x4 pdf4_v = hn::LoadU(di16x4, pdf4[i].val[j].val); + Vec_u8x16 pdf8_u8 = + hn::BitCast(du8x16, hn::Combine(di16x8, pdf4_v, pdf4_v)); + + // g0213 = {pdf[0], pdf[3], pdf[1], pdf[2], + // pdf[2], pdf[1], pdf[3], pdf[0]}; + Vec_i16x8 g0213 = hn::BitCast( + di16x8, (hn::TableLookupLanes( + pdf8_u8, hn::SetTableIndices(du8x16, idx_0213)))); + + // Reverse 32-bit elements in g0213 + // g1302 = {pdf[3], pdf[0], pdf[2], pdf[1], + // pdf[1], pdf[2], pdf[0], pdf[3]}; + Vec_i16x8 g1302 = hn::BitCast( + di16x8, + hn::TableLookupLanes(pdf8_u8, hn::SetTableIndices(du8x16, idx_1302))); + + // b0123 = {beta_kp1[0], beta_kp1[0], beta_kp1[1], beta_kp1[1], + // beta_kp1[2], beta_kp1[2], beta_kp1[3], beta_kp1[3]}; + // b4567 = {beta_kp1[4], beta_kp1[4], beta_kp1[5], beta_kp1[5], + // beta_kp1[6], beta_kp1[6], beta_kp1[7], beta_kp1[7]}; + Vec_i16x8 b0123 = hn::InterleaveWholeLower(di16x8, beta_kp1, beta_kp1); + Vec_i16x8 b4567 = hn::InterleaveUpper(di16x8, beta_kp1, beta_kp1); + + Vec_i16x8 left = hn::SaturatedAdd(g0213, b0123); + Vec_i16x8 right = hn::SaturatedAdd(g1302, b4567); + + Vec_i16x8 beta_k = hn::Max(left, right); + + // a0213 = {alpha[k_idx][0], alpha[k_idx][2], alpha[k_idx][4], alpha[k_idx][6], + // alpha[k_idx][1], alpha[k_idx][3], alpha[k_idx][5], alpha[k_idx][7]}; + Vec_i16x8 alpha_v = hn::LoadU(di16x8, alpha[k_idx].val); + Vec_i16x8 a0213 = hn::BitCast( + di16x8, + hn::TableLookupLanes(hn::BitCast(du8x16, alpha_v), + hn::SetTableIndices(du8x16, idx_even_odd))); + + // b0213_1302 = {beta_kp1[0], beta_kp1[5], beta_kp1[2], beta_kp1[7], + // beta_kp1[4], beta_kp1[1], beta_kp1[6], beta_kp1[3]}; + Vec_i16x8 b0213_1302 = hn::BitCast( + di16x8, + hn::TableLookupLanes(hn::BitCast(du8x16, beta_kp1), + hn::SetTableIndices(du8x16, idx_05274163))); + // Swap upper and lower half + Vec_i16x8 b1302_0213 = hn::BitCast( + di16x8, hn::Reverse2(di64x2, hn::BitCast(di64x2, b0213_1302))); + + // g0101 = {pdf[0], pdf[2], pdf[2], pdf[0]}; + Vec_i16x8 g0101 = hn::BitCast( + di16x8, + hn::TableLookupLanes(pdf8_u8, hn::SetTableIndices(du8x16, idx_0220))); + Vec_i16x8 left_right_0 = + hn::SaturatedAdd(hn::SaturatedAdd(a0213, b0213_1302), g0101); + + // g1010 = {pdf[3], pdf[1], pdf[1], pdf[3]}; + Vec_i16x8 g1010 = hn::BitCast( + di16x8, + hn::TableLookupLanes(pdf8_u8, hn::SetTableIndices(du8x16, idx_3113))); + Vec_i16x8 left_right_1 = + hn::SaturatedAdd(hn::SaturatedAdd(a0213, b1302_0213), g1010); + + prob_0[j] = hn::ReduceMax(di16x8, left_right_0); + prob_1[j] = hn::ReduceMax(di16x8, left_right_1); + + // Store the current value of beta to use in the next + // round of calculations + beta_kp1 = beta_k; + } + + // Calculate the LLRs + Vec_i16x8 
prob_0_v = hn::LoadU(di16x8, prob_0); + Vec_i16x8 prob_1_v = hn::LoadU(di16x8, prob_1); + hn::StoreU(hn::SaturatedSub(prob_0_v, prob_1_v), di16x8, llr[i].val); + } +} + +} // namespace + +// The template parameter allows us to disable checking for convergence (and +// thus terminating the iterations early) so we always run a fixed number of +// iterations in our benchmarking +template +void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t k, uint8_t *dst, + float32_t l_c, uint32_t max_iter, + uint16_t *perm_idxs, Allocator &allocator) { + // This implements multiple steps of the max-log-MAP algorithm, + // which is an approximation to the MAP (BCJR) algorithm. It returns + // a hard decision rather than raw LLRs. + + // We will be working with Vec_i16x8, so work out how many of these + // will be needed to store k int16_ts. k is always a multiple of 8, + // so no need to worry about remainders. + uint32_t k8 = k >> 3; + + auto sys_s16 = allocate_uninitialized(allocator, k8 + 1); + auto par_s16 = allocate_uninitialized(allocator, k8 + 1); + auto itl_s16 = allocate_uninitialized(allocator, k8 + 1); + + auto perm_sys = allocate_uninitialized(allocator, k8 + 1); + + // Allocate space to hold the extrinsic and permuted extrinsic information + // to be passed between the two decoders. Extrinsic is initially set to 0. + auto extrinsic = allocate_zeroed(allocator, k8); + auto perm_extrinsic = allocate_zeroed(allocator, k8); + + // Allocate space for log likelihood ratios from both stages of decoding + auto l1_uky = allocate_uninitialized(allocator, k8); + auto l2_uky = allocate_uninitialized(allocator, k8); + auto prev_l2_uky = allocate_zeroed(allocator, k8); + + // Allocate space to hold alpha and gamma + // alpha stores the forward-accumulated state probabilities for each decoded + // bit, where the LTE encoder has 8 states and there are k+3 bits to decode + // plus the starting condition + auto alpha = allocate_uninitialized(allocator, 8 * k8 + 1); + + // gamma stores the conditional state transition probabilities for each of the + // k+3 bits to decode + auto gamma = allocate_uninitialized(allocator, k8); + + // Get the permutation vector for the input value of k. + // declare unique_ptr here to keep the allocated memory's scope outside the + // else block + unique_ptr perm_lookup_unique; + perm_idx_lookup *perm_lookup = nullptr; + // Find the index into the array of parameter arrays corresponding + // to the current k. Subtract 40 because k=40 is the lowest value. + uint32_t param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; + if (perm_idxs != NULL) { + // NOTE: All allocations done. + if constexpr (Allocator::is_counting) { + return; + } + perm_lookup = (perm_idx_lookup *)perm_idxs + + armral::turbo::perm_lookup_offset[param_idx]; + } else { + perm_lookup_unique = allocate_uninitialized(allocator, k); + + // NOTE: All allocations done. + if constexpr (Allocator::is_counting) { + return; + } + + perm_lookup = perm_lookup_unique.get(); + + // Generate the permutation vector for the input value of k. + armral::turbo::k_perm_idx_init(k, param_idx, perm_lookup); + } + + // Convert our LLRs from int8_ts into int16_ts + convert_llrs(k, sys, sys_s16.get()); + convert_llrs(k, par, par_s16.get()); + convert_llrs(k, itl, itl_s16.get()); + + // Unperturb the trellis termination bits. 
They are transmitted as: + // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 + // Z'2 + // but need to appended to the inputs as: + // X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 + // Z'2 + // We append to the systematic (X), the parity (Z) and the interleaved parity + // (Z') values here, and to the interleaved systematic values (X') further + // down. + sys_s16[k8].val[0] = (int16_t)sys[k]; + sys_s16[k8].val[1] = (int16_t)itl[k]; + sys_s16[k8].val[2] = (int16_t)par[k + 1]; + + par_s16[k8].val[0] = (int16_t)par[k]; + par_s16[k8].val[1] = (int16_t)sys[k + 1]; + par_s16[k8].val[2] = (int16_t)itl[k + 1]; + + itl_s16[k8].val[0] = (int16_t)par[k + 2]; + itl_s16[k8].val[1] = (int16_t)sys[k + 3]; + itl_s16[k8].val[2] = (int16_t)itl[k + 3]; + + // Prescale l_c to avoid doing it repeatedly in the PDF calculations later + // TODO: Does this ever actually get used + Vec_i16x8 channel_reliability = Set(di16x8, (int16_t)l_c / 2); + + // Create a permuted version of the systematic output for use + // with the second decoder + for (uint32_t i = 0; i < k8; i++) { + for (uint32_t j = 0; j < 8; j++) { + perm_sys[i].val[j] = (int16_t)sys[perm_lookup[(i * 8) + j].perm_idx]; + } + } + perm_sys[k8].val[0] = (int16_t)sys[k + 2]; + perm_sys[k8].val[1] = (int16_t)itl[k + 2]; + perm_sys[k8].val[2] = (int16_t)par[k + 3]; + + // Initialize alpha + hn::Store(hn::Set(di16x8, std::numeric_limits::min()), di16x8, + alpha[0].val); + alpha[0].val[0] = 0; + + // Calculate the trellis termination state transition probabilities, which + // do not require any extrinsic information + Vec_i16x8 beta_tail = trellis_termination(sys_s16.get(), par_s16.get(), k8, + channel_reliability); + Vec_i16x8 perm_beta_tail = trellis_termination(perm_sys.get(), itl_s16.get(), + k8, channel_reliability); + + // Initialize the number of iterations + uint32_t num_iter = 0; + + while (num_iter < max_iter) { + // Run the first decoder step + decode_step(sys_s16.get(), par_s16.get(), extrinsic.get(), k8, l1_uky.get(), + alpha.get(), beta_tail, gamma.get(), channel_reliability); + + // Compute the new extrinsic information to pass into the second decoder + update_extrinsic(k8, l1_uky.get(), extrinsic.get(), sys_s16.get()); + + // Need to unpermute extrinsic to match input to second decoder + for (uint32_t i = 0; i < k8; i++) { + for (uint32_t j = 0; j < 8; j++) { + perm_extrinsic[i].val[j] = extrinsic[perm_lookup[i * 8 + j].vec_idx] + .val[perm_lookup[i * 8 + j].vec_lane]; + } + } + + // Run the second decoder step + decode_step(perm_sys.get(), itl_s16.get(), perm_extrinsic.get(), k8, + l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), + channel_reliability); + + // Compute the new extrinsic information to pass back into the first encoder + update_extrinsic(k8, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); + + // But need to unpermute extrinsic first + for (uint32_t i = 0; i < k8; i++) { + for (uint32_t j = 0; j < 8; j++) { + extrinsic[perm_lookup[i * 8 + j].vec_idx] + .val[perm_lookup[i * 8 + j].vec_lane] = perm_extrinsic[i].val[j]; + } + } + + // Compare this iteration's results with those from the previous iteration + int16_t max_abs_diff = 0; + for (uint32_t i = 0; i < k8; i++) { + Vec_i16x8 l2_uky_v = hn::LoadU(di16x8, l2_uky[i].val); + Vec_i16x8 prev_l2_uky_v = hn::LoadU(di16x8, prev_l2_uky[i].val); + int16_t abs_diff = hn::ReduceMax( + di16x8, hn::SaturatedAbs(hn::SaturatedSub(l2_uky_v, prev_l2_uky_v))); + if (abs_diff > max_abs_diff) { + max_abs_diff = abs_diff; + } + } + + // If we've converged, finish decoding + if constexpr 
(check_convergence) { + if (max_abs_diff == 0) { + break; + } + } + + // Store the current "final" LLRs to use in convergence checking next + // iteration + for (uint32_t i = 0; i < k8; i++) { + prev_l2_uky[i] = l2_uky[i]; + } + + num_iter++; + } + + // Return unpermuted final output from second encoder + // Rather than allocate another new vector, copy into l1_uky and return that + for (uint32_t i = 0; i < k8; i++) { + for (uint32_t j = 0; j < 8; j++) { + l1_uky[perm_lookup[i * 8 + j].vec_idx] + .val[perm_lookup[i * 8 + j].vec_lane] = l2_uky[i].val[j]; + } + } + + // Make a hard decision based on the final LLRs + turbo_llrs_to_bits(k, l1_uky.get(), dst); +} diff --git a/src/UpperPHY/Turbo/highway/arm_turbo_rate_recovery.cpp b/src/UpperPHY/Turbo/highway/arm_turbo_rate_recovery.cpp new file mode 100644 index 0000000..19f3606 --- /dev/null +++ b/src/UpperPHY/Turbo/highway/arm_turbo_rate_recovery.cpp @@ -0,0 +1,247 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2024-2025 + Cambridge Consultants Project Reference P5851 +*/ + +#include "armral.h" + +#include "../turbo_tables.hpp" +#include "utils/allocators.hpp" +#include "utils/hwy_types.hpp" + +#include +#include +#include + +namespace armral::turbo { + +namespace hn = hwy::HWY_NAMESPACE; + +HWY_FORCED_INLINE int8_t sat_add_8(int8_t a, int8_t b) { + int16_t partial_res = (uint16_t)a + (uint16_t)b; + if (partial_res > INT8_MAX) { + return INT8_MAX; + } + if (partial_res < INT8_MIN) { + return INT8_MIN; + } + return partial_res; +} + +struct dummy_bits_work_buffers { + int8_t *dummy0; + int8_t *dummy1; + int8_t *dummy2; +}; + +static void generate_dummy_bits_tracking(uint32_t d, uint32_t rtc, + int8_t *dummy, + dummy_bits_work_buffers work_buffers) { + // The dummy bits are the first nd bits of each of the three streams of data + // in the turbo code. Where these are in the overall input data stream is + // determined in a manner similar to the encoding, described in section + // 5.1.4.1.1 in 3GPP specification 36.212. + const uint32_t kpi = rtc * armral::turbo::ctc; + const uint32_t nd = kpi - d; + + // Tag nd elements as dummy bits. + // dummy0 and dummy1 are permuted and transposed. + for (uint32_t i = 0; i < nd; ++i) { + work_buffers.dummy0[armral::turbo::p[i] * rtc] = 1; + work_buffers.dummy1[armral::turbo::p[i] * rtc] = 1; + } + // Permutation for dummy2 + for (uint32_t i = 0; i < kpi; ++i) { + // TODO: We don't need to go through all of kpi here. We should be able to + // identify where each of the nd < crc = 32 bits goes. 
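+    // This mirrors the d^(2) sub-block interleaver permutation from the
+    // section referenced above:
+    //   pi = (P(floor(i / rtc)) + ctc * (i mod rtc) + 1) mod kpi,
+    // with rtc rows and ctc = 32 columns.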
+ uint32_t pi = + (armral::turbo::p[i / rtc] + armral::turbo::ctc * (i % rtc) + 1) % kpi; + if (pi < nd) { + work_buffers.dummy2[i] = 1; + } + } + + // bit collection step for dummy + memcpy((void *)dummy, (const void *)work_buffers.dummy0, + sizeof(int8_t) * kpi); + for (uint32_t i = 0; i < kpi; ++i) { + dummy[kpi + 2 * i] = work_buffers.dummy1[i]; + dummy[kpi + 2 * i + 1] = work_buffers.dummy2[i]; + } +} + +// Undo the turbo rate matching subblock interleaver defined in +// TS 36.212 Section 5.1.4.1.1 +struct subblock_deinterleave_work_buffers { + int8_t *y0; + int8_t *y1; + int8_t *y2; +}; + +static void +subblock_deinterleave(uint32_t d, uint32_t rtc, const int8_t *v0, + const int8_t *v1, const int8_t *v2, int8_t *d0, + int8_t *d1, int8_t *d2, + subblock_deinterleave_work_buffers work_buffers) { + + const uint32_t kpi = rtc * armral::turbo::ctc; + const uint32_t nd = kpi - d; + + // Reverse permutation and transpose for d^(0)_k and d^(1)_k + for (uint32_t i = 0; i < rtc; ++i) { + for (uint32_t j = 0; j < armral::turbo::ctc; ++j) { + uint32_t k = j + i * armral::turbo::ctc; + work_buffers.y0[k] = v0[i + armral::turbo::p[j] * rtc]; + work_buffers.y1[k] = v1[i + armral::turbo::p[j] * rtc]; + } + } + + // Reverse permutation for d^(2)_k + for (uint32_t i = 0; i < kpi; ++i) { + uint32_t pi = + (armral::turbo::p[i / rtc] + armral::turbo::ctc * (i % rtc) + 1) % kpi; + work_buffers.y2[pi] = v2[i]; + } + + // Ignore nd elements as they are dummy bits + // d0, d1 and d2 may already contain LLRs so we sum into them + // rather than overwriting + for (uint32_t i = 0; i < d; i += hn::Lanes(di8x16)) { + size_t batch = HWY_MIN(d - i, hn::Lanes(di8x16)); + Mask_i8x16 mask = hn::FirstN(di8x16, batch); + + Vec_i8x16 d0_vec = no_sanitize::MaskedLoad(mask, di8x16, d0 + i); + Vec_i8x16 y0_vec = + no_sanitize::MaskedLoad(mask, di8x16, &work_buffers.y0[nd + i]); + hn::StoreN(hn::SaturatedAdd(d0_vec, y0_vec), di8x16, d0 + i, batch); + + Vec_i8x16 d1_vec = no_sanitize::MaskedLoad(mask, di8x16, d1 + i); + Vec_i8x16 y1_vec = + no_sanitize::MaskedLoad(mask, di8x16, &work_buffers.y1[nd + i]); + hn::StoreN(hn::SaturatedAdd(d1_vec, y1_vec), di8x16, d1 + i, batch); + + Vec_i8x16 d2_vec = no_sanitize::MaskedLoad(mask, di8x16, d2 + i); + Vec_i8x16 y2_vec = + no_sanitize::MaskedLoad(mask, di8x16, &work_buffers.y2[nd + i]); + hn::StoreN(hn::SaturatedAdd(d2_vec, y2_vec), di8x16, d2 + i, batch); + } +} + +// Undo the turbo rate matching bit collection defined in +// TS 36.212 Section 5.1.4.1.2 +static void bit_decollection(uint32_t kpi, const int8_t *w, int8_t *v0, + int8_t *v1, int8_t *v2) { + // w_k = v^0_k for k = [0, kpi - 1] + memcpy((void *)v0, (const void *)w, sizeof(int8_t) * kpi); + + // v^1_k = w_{kpi + 2k }, + // v^2_k = w_{kpi + 2k + 1} for k = [0, kpi - 1] + for (uint32_t k = 0; k < kpi; ++k) { + v1[k] = w[kpi + 2 * k]; + v2[k] = w[kpi + 2 * k + 1]; + } +} + +// Undo the turbo rate matching bit selection defined in +// TS 36.212 Section 5.1.4.1.2 +static void bit_deselection(uint32_t ncb, uint32_t k0, uint32_t e, + const int8_t *ek, const int8_t *dummy, int8_t *w) { + uint32_t k = 0; + uint32_t j = 0; + while (k < e) { + uint32_t i = (k0 + j) % ncb; + bool not_dummy = dummy[i] != 1; + if (not_dummy) { + w[i] = sat_add_8(w[i], ek[k]); + k++; + } + j++; + } +} + +template +armral_status rate_recovery(uint32_t d, uint32_t e, uint32_t rv, + const int8_t *src, int8_t *dst0, int8_t *dst1, + int8_t *dst2, Allocator &allocator) { + assert(d > 0); + assert(e > 0); + assert(rv <= 3); + + // The minimum number of rows 
which gives rtc * ctc >= d. + const uint32_t rtc = (d + armral::turbo::ctc - 1) / armral::turbo::ctc; + const uint32_t kpi = rtc * armral::turbo::ctc; + const uint32_t kw = 3 * kpi; + + auto dummy = allocate_zeroed(allocator, kpi * 3); + auto dummy0 = allocate_zeroed(allocator, kpi); + auto dummy1 = allocate_zeroed(allocator, kpi); + auto dummy2 = allocate_zeroed(allocator, kpi); + + auto w = allocate_zeroed(allocator, kpi * 3); + + auto v0 = allocate_zeroed(allocator, kpi); + auto v1 = allocate_zeroed(allocator, kpi); + auto v2 = allocate_zeroed(allocator, kpi); + + auto y0 = allocate_zeroed(allocator, kpi); + auto y1 = allocate_zeroed(allocator, kpi); + auto y2 = allocate_zeroed(allocator, kpi); + + if constexpr (Allocator::is_counting) { + return ARMRAL_SUCCESS; + } + + // Assume N_cb = k_w. + const uint32_t ncb = kw; + + // Calculate k0 with the assumption N_cb = k_w, as per section 5.1.4.1.2 of + // 3GPP specification 36.212. We can do this, as we assume that we only ever + // deal with a single code block per transport block. + // k0 = rtc * (2 * N_cb/(8 * rtc) * rv + 2), + // where + // N_cb = kw + // = 3 * ctc * rtc + // = 3 * 32 * rtc + const uint32_t k0 = rtc * (24 * rv + 2); + + generate_dummy_bits_tracking(d, rtc, dummy.get(), + {dummy0.get(), dummy1.get(), dummy2.get()}); + + bit_deselection(ncb, k0, e, src, dummy.get(), w.get()); + + bit_decollection(kpi, w.get(), v0.get(), v1.get(), v2.get()); + + subblock_deinterleave(d, rtc, v0.get(), v1.get(), v2.get(), dst0, dst1, dst2, + {y0.get(), y1.get(), y2.get()}); + + return ARMRAL_SUCCESS; +} + +} // namespace armral::turbo + +armral_status armral_turbo_rate_recovery(uint32_t d, uint32_t e, uint32_t rv, + const int8_t *src, int8_t *dst0, + int8_t *dst1, int8_t *dst2) { + heap_allocator allocator{}; + return armral::turbo::rate_recovery(d, e, rv, src, dst0, dst1, dst2, + allocator); +} + +armral_status armral_turbo_rate_recovery_noalloc(uint32_t d, uint32_t e, + uint32_t rv, const int8_t *src, + int8_t *dst0, int8_t *dst1, + int8_t *dst2, void *buffer) { + buffer_bump_allocator allocator{buffer}; + return armral::turbo::rate_recovery(d, e, rv, src, dst0, dst1, dst2, + allocator); +} + +uint32_t armral_turbo_rate_recovery_noalloc_buffer_size(uint32_t d, uint32_t e, + uint32_t rv) { + counting_allocator allocator{}; + (void)armral::turbo::rate_recovery(d, e, rv, nullptr, nullptr, nullptr, + nullptr, allocator); + return allocator.required_bytes(); +} diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp index 22e363e..889ae79 100644 --- a/src/utils/hwy_types.hpp +++ b/src/utils/hwy_types.hpp @@ -45,6 +45,7 @@ using Vec_i64x2 = hn::Vec; // Mask Types using Mask_u8x16 = hn::Mask; +using Mask_i8x16 = hn::Mask; using Mask_u32x4 = hn::Mask; // Rebind Tags @@ -53,6 +54,13 @@ where the first tag named in the rebind tag is the old type which the rebind tag is created from and the second is the new tag type. These are used in operations where output vector width is different from that of the input. */ +const hn::Rebind di16x8_di8x16; + +// Half Vector Tags +const hn::Half di16x4; + +// Half Vector Types +using Vec_i16x4 = hn::Vec; /* Scalable vector types. 
The default choice should be to use @@ -116,7 +124,7 @@ const hn::Half du16_half; // Half Vector Types using Vec_i8_half = hn::Vec; - +using Vec_i16_half = hn::Vec; /* It has been found that highway implementations of MaskedLoad and MaskedLoadU are memory unsafe and will not diff --git a/test/UpperPHY/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp index cd1697d..b10c743 100644 --- a/test/UpperPHY/Turbo/turbo_test_data.hpp +++ b/test/UpperPHY/Turbo/turbo_test_data.hpp @@ -6,6 +6,9 @@ #include "rng.hpp" +#ifdef ARMRAL_ARCH_HWY +#include +#endif #include static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { @@ -19,6 +22,7 @@ static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { // round k/8 up to the next multiple of 8 and divide by 8 auto len = (k_bytes + 7) / 8; +#ifndef ARMRAL_ARCH_HWY for (uint32_t i = 0; i < len; ++i) { uint32_t val32_0 = lcg.one(&state); uint32_t val32_1 = lcg.one(&state); @@ -28,6 +32,18 @@ static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { src[(i * 8) + j] = val8_8[j]; } } +#else + for (uint32_t i = 0; i < len; ++i) { + uint8_t val8_8[8]; + uint32_t val32_0 = lcg.one(&state); + uint32_t val32_1 = lcg.one(&state); + memcpy(val8_8, &val32_0, sizeof(uint32_t)); + memcpy(val8_8 + sizeof(uint32_t), &val32_1, sizeof(uint32_t)); + for (uint32_t j = 0; j < 8 && (i * 8) + j < k_bytes; ++j) { + src[(i * 8) + j] = val8_8[j]; + } + } +#endif } // The valid values of k for Turbo coding, from TS36.212 -- GitLab From dd263d9f27d2ac42c00c0097edaad51f833f9447 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Wed, 22 Jan 2025 10:45:05 +0000 Subject: [PATCH 19/20] Fix the no_sanitise attributes not being applied correctly when no_sanitize::MaskedLoad is nested --- .../Modulation/highway/arm_modulation.cpp | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/UpperPHY/Modulation/highway/arm_modulation.cpp b/src/UpperPHY/Modulation/highway/arm_modulation.cpp index d1708b3..fa051d5 100644 --- a/src/UpperPHY/Modulation/highway/arm_modulation.cpp +++ b/src/UpperPHY/Modulation/highway/arm_modulation.cpp @@ -19,6 +19,7 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, armral_cmplx_int16_t *p_dst) { /* Compute the number of blocks of 2 bits in the new tail */ uint32_t final_blck = (nbits >> 1U) & 3; + using Vec_u16_u8 = hn::Vec; const size_t n_lanes8 = hn::Lanes(di8); const size_t n_lanes16 = hn::Lanes(di16); @@ -77,8 +78,9 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, index = hn::InterleaveWholeLower(du16, index, index); index = hn::InterleaveWholeLower(du16, index, index); for (uint32_t i = 0; i < unrolls; i++) { - Vec_u16 src_bytes = - hn::PromoteTo(du16, no_sanitize::MaskedLoad(pred, du16_du8, p_src)); + Vec_u16_u8 src_bytes_narrow = + no_sanitize::MaskedLoad(pred, du16_du8, p_src); + Vec_u16 src_bytes = hn::PromoteTo(du16, src_bytes_narrow); p_src += vl; Vec_u16 tbl = hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); @@ -93,8 +95,9 @@ void armral_qpsk_modulation(uint32_t nbits, const uint8_t *p_src, hn::FirstN(du16_du8, leftover_bytes); const uint32_t active_store_lanes = leftover_bytes * 8; if (leftover_bytes != 0) { - Vec_u16 src_bytes = hn::PromoteTo( - du16, no_sanitize::MaskedLoad(load_lanes, du16_du8, p_src)); + Vec_u16_u8 src_bytes_narrow = + no_sanitize::MaskedLoad(load_lanes, du16_du8, p_src); + Vec_u16 src_bytes = hn::PromoteTo(du16, src_bytes_narrow); p_src += leftover_bytes; Vec_u16 tbl = 
hn::TableLookupLanes(src_bytes, hn::IndicesFromVec(du16, index)); @@ -399,7 +402,7 @@ static const armral_cmplx_int16_t constellation_16qam_outer_prod[256][2] = { void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, armral_cmplx_int16_t *p_dst) { - + using Vec_u64_u8 = hn::Vec; /* Compute the number of bytes */ uint32_t bytes = nbits >> 3U; @@ -429,8 +432,9 @@ void armral_16qam_modulation(const uint32_t nbits, const uint8_t *p_src, uint32_t tail_size = blk_cnt - i; Mask_i64 pred_i64 = hn::FirstN(di64, tail_size); hn::Mask pred_u8 = hn::FirstN(du64_du8, tail_size); - Vec_i64 svsample = hn::PromoteTo( - di64, no_sanitize::MaskedLoad(pred_u8, du64_du8, p_src)); + Vec_u64_u8 svsample_narrow = + no_sanitize::MaskedLoad(pred_u8, du64_du8, p_src); + Vec_i64 svsample = hn::PromoteTo(di64, svsample_narrow); p_src += blk_cnt - i; Vec_i64 gather = hn::MaskedGatherIndex( pred_i64, di64, (const int64_t *)constellation_16qam_outer_prod, @@ -732,6 +736,7 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, uint64_t vl = Lanes(du32); /* Compute the blocks which will be processed using loop unroll */ uint32_t unr_cnt = bytes / vl; + using Vec_i32_u8 = hn::Vec; for (uint32_t i = 0; i < unr_cnt; i++) { Vec_i32 index = hn::PromoteTo(di32, hn::LoadU(di32_du8, p_src)); @@ -745,8 +750,8 @@ void armral_256qam_modulation(const uint32_t nbits, const uint8_t *p_src, const uint32_t leftover_bytes = bytes - unr_cnt * vl; if (leftover_bytes != 0U) { hn::Mask pred = hn::FirstN(di32_du8, leftover_bytes); - Vec_i32 index = - hn::PromoteTo(di32, no_sanitize::MaskedLoad(pred, di32_du8, p_src)); + Vec_i32_u8 index_narrow = no_sanitize::MaskedLoad(pred, di32_du8, p_src); + Vec_i32 index = hn::PromoteTo(di32, index_narrow); Vec_i32 gather = hn::GatherIndex(di32, (const int32_t *)constellation_256qam, index); hn::StoreN(gather, di32, (int32_t *)p_dst, leftover_bytes); -- GitLab From 96c4eabb520a2863dae8092477f93a25ba450df8 Mon Sep 17 00:00:00 2001 From: William Van den Aardweg Date: Thu, 23 Jan 2025 10:02:36 +0000 Subject: [PATCH 20/20] Apply changes requested in Arm review --- CREDITS.md | 18 +- src/UpperPHY/LDPC/highway/ldpc_encoder.cpp | 2 +- .../LDPC/{ => highway}/ldpc_tables.hpp | 0 src/UpperPHY/LDPC/ldpc_encoder.cpp | 929 +++++++++++++++++- test/UpperPHY/Turbo/turbo_test_data.hpp | 17 +- 5 files changed, 939 insertions(+), 27 deletions(-) rename src/UpperPHY/LDPC/{ => highway}/ldpc_tables.hpp (100%) diff --git a/CREDITS.md b/CREDITS.md index 3e5abe7..c0371e9 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -3,15 +3,15 @@ following people and organizations have contributed to Arm RAN Acceleration Library: - The following Google Highway implementations: - `src/LowerPHY/Scrambling/highway/arm_scrambling.cpp`, - `src/LowerPHY/SeqGenerator/highway/arm_mat_seq_generator.cpp`, - `src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp`, - `src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp`, - `src/UpperPHY/Modulation/highway/arm_modulation.cpp`, - `src/UpperPHY/Demodulation/highway/arm_demodulation.cpp`, - `src/UpperPHY/LDPC/highway/ldpc_encoder.cpp`, - `src/UpperPHY/LDPC/highway/ldpc_decoder.cpp`, - `src/utils/highway/bits_to_bytes.hpp` + `highway/arm_scrambling.cpp`, + `highway/arm_mat_seq_generator.cpp`, + `highway/arm_convolutional_encoder.cpp`, + `highway/arm_convolutional_decoder.cpp`, + `highway/arm_modulation.cpp`, + `highway/arm_demodulation.cpp`, + `highway/ldpc_encoder.cpp`, + `highway/ldpc_decoder.cpp`, + `highway/bits_to_bytes.hpp` have 
been contributed by Cambridge Consultants. See . diff --git a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp index f0db1ed..8f165c0 100644 --- a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp @@ -3,7 +3,7 @@ SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "../ldpc_coding.hpp" -#include "../ldpc_tables.hpp" +#include "ldpc_tables.hpp" #include "utils/allocators.hpp" #include "utils/bits_to_bytes.hpp" diff --git a/src/UpperPHY/LDPC/ldpc_tables.hpp b/src/UpperPHY/LDPC/highway/ldpc_tables.hpp similarity index 100% rename from src/UpperPHY/LDPC/ldpc_tables.hpp rename to src/UpperPHY/LDPC/highway/ldpc_tables.hpp diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index ff205cf..5e97e35 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -4,7 +4,6 @@ */ #include "armral.h" #include "ldpc_coding.hpp" -#include "ldpc_tables.hpp" #include "utils/allocators.hpp" #include "utils/bits_to_bytes.hpp" @@ -17,6 +16,934 @@ #include namespace { +// The base graphs are given in compressed sparse row format. We need three +// arrays for this. Firstly we have the row start indices, which stores the +// indices into another array which indicates where the row starts. The next +// array stores the indices of the columns which are non-zero in a row. Finally, +// we have an array of values corresponding to the non-zero entries in the +// matrix. +// For example, `bg1_row_start[3]` is the index into `bg1_columns` for the +// start of the fourth row, and `bg1_columns[bg1_row_starts[3]]` is the index of +// a column in the fourth row of the matrix which contains a non-zero value. + +// Base graph 1 is taken from 3GPP standard document 38.212 table 5.3.2-2. 
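+//
+// In code form (illustrative only), the non-zero block columns of block row i
+// can therefore be walked as:
+//   for (uint32_t idx = bg1_row_start[i]; idx < bg1_row_start[i + 1]; ++idx) {
+//     uint32_t col = bg1_columns[idx];  // block column with a non-zero entry
+//   }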
+ +// The row start indices for base graph 1 +const uint32_t bg1_row_start[] = { + 0, 19, 38, 57, 76, 79, 87, 96, 103, 113, 122, 129, + 137, 144, 150, 157, 164, 170, 176, 182, 188, 194, 200, 205, + 210, 216, 221, 226, 230, 235, 240, 245, 250, 255, 260, 265, + 270, 275, 279, 284, 289, 293, 298, 302, 307, 312, 316}; + +// clang-format off +const uint32_t bg1_columns[] = { + 0, 1, 2, 3, 5, 6, 9, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 23, // row 0: 19 + 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, 24, // row 1: 19 + 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 18, 19, 20, 24, 25, // row 2: 19 + 0, 1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 25, // row 3: 19 + 0, 1, 26, // row 4: 3 + 0, 1, 3, 12, 16, 21, 22, 27, // row 5: 8 + 0, 6, 10, 11, 13, 17, 18, 20, 28, // row 6: 9 + 0, 1, 4, 7, 8, 14, 29, // row 7: 7 + 0, 1, 3, 12, 16, 19, 21, 22, 24, 30, // row 8: 10 + 0, 1, 10, 11, 13, 17, 18, 20, 31, // row 9: 9 + 1, 2, 4, 7, 8, 14, 32, // row 10: 7 + 0, 1, 12, 16, 21, 22, 23, 33, // row 11: 8 + 0, 1, 10, 11, 13, 18, 34, // row 12: 7 + 0, 3, 7, 20, 23, 35, // row 13: 6 + 0, 12, 15, 16, 17, 21, 36, // row 14: 7 + 0, 1, 10, 13, 18, 25, 37, // row 15: 7 + 1, 3, 11, 20, 22, 38, // row 16: 6 + 0, 14, 16, 17, 21, 39, // row 17: 6 + 1, 12, 13, 18, 19, 40, // row 18: 6 + 0, 1, 7, 8, 10, 41, // row 19: 6 + 0, 3, 9, 11, 22, 42, // row 20: 6 + 1, 5, 16, 20, 21, 43, // row 21: 6 + 0, 12, 13, 17, 44, // row 22: 5 + 1, 2, 10, 18, 45, // row 23: 5 + 0, 3, 4, 11, 22, 46, // row 24: 6 + 1, 6, 7, 14, 47, // row 25: 5 + 0, 2, 4, 15, 48, // row 26: 5 + 1, 6, 8, 49, // row 27: 4 + 0, 4, 19, 21, 50, // row 28: 5 + 1, 14, 18, 25, 51, // row 29: 5 + 0, 10, 13, 24, 52, // row 30: 5 + 1, 7, 22, 25, 53, // row 31: 5 + 0, 12, 14, 24, 54, // row 32: 5 + 1, 2, 11, 21, 55, // row 33: 5 + 0, 7, 15, 17, 56, // row 34: 5 + 1, 6, 12, 22, 57, // row 35: 5 + 0, 14, 15, 18, 58, // row 36: 5 + 1, 13, 23, 59, // row 37: 4 + 0, 9, 10, 12, 60, // row 38: 5 + 1, 3, 7, 19, 61, // row 39: 5 + 0, 8, 17, 62, // row 40: 4 + 1, 3, 9, 18, 63, // row 41: 5 + 0, 4, 24, 64, // row 42: 4 + 1, 16, 18, 25, 65, // row 43: 5 + 0, 7, 9, 22, 66, // row 44: 5 + 1, 6, 10, 67 // row 45: 4 +}; + +// The shifts are organized by row, and then by index set. Each line in the +// following represents the shifts in one index set for one block row of the +// matrix. Indexing into the array works as follows. 
If we are using index set k +// for k in [0, 7], and are on block row i, then the indexing function from k, i +// to j is ind(k, i) = 8 * bg1_row_start[i] + (bg1_row_start[i+1] - +// bg1_row_start[i]) * k +const uint32_t bg1_shifts[] = { + 250, 69, 226, 159, 100, 10, 59, 229, 110, 191, 9, 195, 23, 190, 35, 239, 31, 1, 0, // row 0 + 307, 19, 50, 369, 181, 216, 317, 288, 109, 17, 357, 215, 106, 242, 180, 330, 346, 1, 0, + 73, 15, 103, 49, 240, 39, 15, 162, 215, 164, 133, 298, 110, 113, 16, 189, 32, 1, 0, + 223, 16, 94, 91, 74, 10, 0, 205, 216, 21, 215, 14, 70, 141, 198, 104, 81, 1, 0, + 211, 198, 188, 186, 219, 4, 29, 144, 116, 216, 115, 233, 144, 95, 216, 73, 261, 1, 0, + 294, 118, 167, 330, 207, 165, 243, 250, 1, 339, 201, 53, 347, 304, 167, 47, 188, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 135, 227, 126, 134, 84, 83, 53, 225, 205, 128, 75, 135, 217, 220, 90, 105, 137, 1, 0, + + 2, 239, 117, 124, 71, 222, 104, 173, 220, 102, 109, 132, 142, 155, 255, 28, 0, 0, 0, // row 1 + 76, 76, 73, 288, 144, 331, 331, 178, 295, 342, 217, 99, 354, 114, 331, 112, 0, 0, 0, + 303, 294, 27, 261, 161, 133, 4, 80, 129, 300, 76, 266, 72, 83, 260, 301, 0, 0, 0, + 141, 45, 151, 46, 119, 157, 133, 87, 206, 93, 79, 9, 118, 194, 31, 187, 0, 0, 0, + 179, 162, 223, 256, 160, 76, 202, 117, 109, 15, 72, 152, 158, 147, 156, 119, 0, 0, 0, + 77, 225, 96, 338, 268, 112, 302, 50, 167, 253, 334, 242, 257, 133, 9, 302, 0, 0, 0, + 22, 11, 124, 0, 10, 0, 0, 2, 16, 60, 0, 6, 30, 0, 168, 31, 105, 0, 0, + 96, 236, 136, 221, 128, 92, 172, 56, 11, 189, 95, 85, 153, 87, 163, 216, 0, 0, 0, + + 106, 111, 185, 63, 117, 93, 229, 177, 95, 39, 142, 225, 225, 245, 205, 251, 117, 0, 0, // row 2 + 205, 250, 328, 332, 256, 161, 267, 160, 63, 129, 200, 88, 53, 131, 240, 205, 13, 0, 0, + 68, 7, 80, 280, 38, 227, 202, 200, 71, 106, 295, 283, 301, 184, 246, 230, 276, 0, 0, + 207, 203, 31, 176, 180, 186, 95, 153, 177, 70, 77, 214, 77, 198, 117, 223, 90, 0, 0, + 258, 167, 220, 133, 243, 202, 218, 63, 0, 3, 74, 229, 0, 216, 269, 200, 234, 0, 0, + 226, 35, 213, 302, 111, 265, 128, 237, 294, 127, 110, 286, 125, 131, 163, 210, 7, 0, 0, + 132, 37, 21, 180, 4, 149, 48, 38, 122, 195, 155, 28, 85, 47, 179, 42, 66, 0, 0, + 189, 4, 225, 151, 236, 117, 179, 92, 24, 68, 6, 101, 33, 96, 125, 67, 230, 0, 0, + + 121, 89, 84, 20, 150, 131, 243, 136, 86, 246, 219, 211, 240, 76, 244, 144, 12, 1, 0, // row 3 + 276, 87, 0, 275, 199, 153, 56, 132, 305, 231, 341, 212, 304, 300, 271, 39, 357, 1, 0, + 220, 208, 30, 197, 61, 175, 79, 281, 303, 253, 164, 53, 44, 28, 77, 319, 68, 1, 0, + 201, 18, 165, 5, 45, 142, 16, 34, 155, 213, 147, 69, 96, 74, 99, 30, 158, 1, 0, + 187, 145, 166, 108, 82, 132, 197, 41, 162, 57, 36, 115, 242, 165, 0, 113, 108, 1, 0, + 97, 94, 49, 279, 139, 166, 91, 106, 246, 345, 269, 185, 249, 215, 143, 121, 121, 1, 0, + 4, 6, 33, 113, 49, 21, 6, 151, 83, 154, 87, 5, 92, 173, 120, 2, 142, 0, 0, + 128, 23, 162, 220, 43, 186, 96, 1, 216, 22, 24, 167, 200, 32, 235, 172, 219, 1, 0, + + 157, 102, 0, // row 4 + 332, 181, 0, + 233, 205, 0, + 170, 10, 0, + 246, 235, 0, + 42, 256, 0, + 24, 204, 0, + 64, 211, 0, + + 205, 236, 194, 231, 28, 123, 115, 0, // row 5 + 195, 14, 115, 166, 241, 51, 157, 0, + 83, 292, 50, 318, 201, 267, 279, 0, + 164, 59, 86, 80, 182, 130, 153, 0, + 261, 181, 72, 283, 254, 79, 144, 0, + 219, 130, 251, 322, 295, 258, 283, 0, + 185, 100, 24, 65, 207, 161, 72, 0, + 2, 171, 47, 143, 210, 180, 180, 0, + + 183, 22, 28, 67, 244, 11, 157, 211, 0, // row 6 + 278, 257, 1, 351, 92, 253, 18, 225, 0, + 289, 21, 293, 13, 232, 302, 
138, 235, 0, + 158, 119, 113, 21, 63, 51, 136, 116, 0, + 80, 144, 169, 90, 59, 177, 151, 108, 0, + 294, 73, 330, 99, 172, 150, 284, 305, 0, + 6, 27, 163, 50, 48, 24, 38, 91, 0, + 199, 22, 23, 100, 92, 207, 52, 13, 0, + + 220, 44, 159, 31, 167, 104, 0, // row 7 + 9, 62, 316, 333, 290, 114, 0, + 12, 88, 207, 50, 25, 76, 0, + 17, 76, 104, 100, 150, 158, 0, + 169, 189, 154, 184, 104, 164, 0, + 3, 103, 224, 297, 215, 39, 0, + 145, 88, 112, 153, 159, 76, 0, + 77, 146, 209, 32, 166, 18, 0, + + 112, 4, 7, 211, 102, 164, 109, 241, 90, 0, // row 8 + 307, 179, 165, 18, 39, 224, 368, 67, 170, 0, + 295, 133, 130, 231, 296, 110, 269, 245, 154, 0, + 33, 95, 4, 217, 204, 39, 58, 44, 201, 0, + 54, 0, 252, 41, 98, 46, 15, 230, 54, 0, + 348, 75, 22, 312, 224, 17, 59, 314, 244, 0, + 172, 2, 131, 141, 96, 99, 101, 35, 116, 0, + 181, 105, 141, 223, 177, 145, 199, 153, 38, 0, + + 103, 182, 109, 21, 142, 14, 61, 216, 0, // row 9 + 366, 232, 321, 133, 57, 303, 63, 82, 0, + 189, 244, 36, 286, 151, 267, 135, 209, 0, + 9, 37, 213, 105, 89, 185, 109, 218, 0, + 162, 159, 93, 134, 45, 132, 76, 209, 0, + 156, 88, 293, 111, 92, 152, 23, 337, 0, + 6, 10, 145, 53, 201, 4, 164, 173, 0, + 169, 12, 206, 221, 17, 212, 92, 205, 0, + + 98, 149, 167, 160, 49, 58, 0, // row 10 + 101, 339, 274, 111, 383, 354, 0, + 14, 80, 211, 75, 161, 311, 0, + 82, 165, 174, 19, 194, 103, 0, + 178, 1, 28, 267, 234, 201, 0, + 175, 253, 27, 231, 49, 267, 0, + 126, 77, 156, 16, 12, 70, 0, + 116, 151, 70, 230, 115, 84, 0, + + 77, 41, 83, 182, 78, 252, 22, 0, // row 11 + 48, 102, 8, 47, 188, 334, 115, 0, + 16, 147, 290, 289, 177, 43, 280, 0, + 52, 11, 2, 35, 32, 84, 201, 0, + 55, 23, 274, 181, 273, 39, 26, 0, + 25, 322, 200, 351, 166, 338, 192, 0, + 184, 194, 123, 16, 104, 109, 124, 0, + 45, 115, 134, 1, 152, 165, 107, 0, + + 160, 42, 21, 32, 234, 7, 0, // row 12 + 77, 186, 174, 232, 50, 74, 0, + 229, 235, 169, 48, 105, 52, 0, + 142, 175, 136, 3, 28, 182, 0, + 225, 162, 244, 151, 238, 243, 0, + 123, 217, 142, 110, 176, 76, 0, + 6, 20, 203, 153, 104, 207, 0, + 186, 215, 124, 180, 98, 80, 0, + + 177, 248, 151, 185, 62, 0, // row 13 + 313, 177, 266, 115, 370, 0, + 39, 302, 303, 160, 37, 0, + 81, 56, 72, 217, 78, 0, + 231, 0, 216, 47, 36, 0, + 311, 251, 265, 94, 81, 0, + 52, 147, 1, 16, 46, 0, + 220, 185, 154, 178, 150, 0, + + 206, 55, 206, 127, 16, 229, 0, // row 14 + 142, 248, 137, 89, 347, 12, 0, + 78, 299, 54, 61, 179, 258, 0, + 14, 175, 211, 191, 51, 43, 0, + 0, 186, 253, 16, 0, 79, 0, + 22, 322, 277, 156, 66, 78, 0, + 1, 202, 118, 130, 1, 2, 0, + 124, 144, 182, 95, 72, 76, 0, + + 40, 96, 65, 63, 75, 179, 0, // row 15 + 241, 2, 210, 318, 55, 269, 0, + 229, 290, 60, 130, 184, 51, 0, + 90, 120, 131, 209, 209, 81, 0, + 170, 0, 183, 108, 68, 64, 0, + 176, 348, 15, 81, 176, 113, 0, + 173, 6, 81, 182, 53, 46, 0, + 39, 138, 220, 173, 142, 49, 0, + + 64, 49, 49, 51, 154, 0, // row 16 + 13, 338, 57, 289, 57, 0, + 69, 140, 45, 115, 300, 0, + 154, 164, 43, 189, 101, 0, + 270, 13, 99, 54, 0, 0, + 190, 293, 332, 331, 114, 0, + 88, 198, 160, 122, 182, 0, + 78, 152, 84, 5, 205, 0, + + 7, 164, 59, 1, 144, 0, // row 17 + 260, 303, 81, 358, 375, 0, + 257, 147, 128, 51, 228, 0, + 56, 110, 200, 63, 4, 0, + 153, 137, 0, 0, 162, 0, + 110, 228, 247, 116, 190, 0, + 91, 184, 30, 3, 155, 0, + 183, 112, 106, 219, 129, 0, + + 42, 233, 8, 155, 147, 0, // row 18 + 130, 163, 280, 132, 4, 0, + 260, 294, 291, 141, 295, 0, + 199, 110, 200, 143, 186, 0, + 161, 151, 0, 241, 144, 0, + 47, 286, 246, 181, 73, 0, + 1, 41, 167, 68, 148, 0, + 183, 215, 180, 143, 14, 0, + + 60, 73, 72, 127, 
224, 0, // row 19 + 145, 213, 344, 242, 197, 0, + 64, 181, 101, 270, 41, 0, + 8, 6, 103, 198, 8, 0, + 0, 0, 118, 144, 0, 0, + 87, 110, 147, 258, 204, 0, + 12, 6, 166, 184, 191, 0, + 179, 108, 159, 138, 196, 0, + + 151, 186, 217, 47, 160, 0, // row 20 + 187, 206, 264, 341, 59, 0, + 301, 162, 40, 130, 10, 0, + 105, 210, 121, 214, 183, 0, + 265, 81, 90, 144, 228, 0, + 89, 65, 155, 244, 30, 0, + 6, 12, 15, 5, 30, 0, + 77, 187, 203, 167, 130, 0, + + 249, 121, 109, 131, 171, 0, // row 21 + 205, 102, 328, 213, 97, 0, + 79, 175, 132, 283, 103, 0, + 192, 131, 220, 50, 106, 0, + 64, 46, 266, 9, 18, 0, + 162, 264, 346, 143, 109, 0, + 6, 86, 96, 42, 199, 0, + 197, 122, 215, 65, 216, 0, + + 64, 142, 188, 158, 0, // row 22 + 30, 11, 233, 22, 0, + 177, 20, 55, 316, 0, + 53, 0, 3, 148, 0, + 72, 189, 72, 257, 0, + 280, 157, 236, 113, 0, + 44, 58, 130, 131, 0, + 25, 47, 126, 178, 0, + + 156, 147, 170, 152, 0, // row 23 + 24, 89, 61, 27, 0, + 249, 50, 133, 105, 0, + 88, 203, 168, 122, 0, + 180, 0, 0, 165, 0, + 18, 6, 181, 304, 0, + 45, 18, 132, 100, 0, + 185, 127, 117, 199, 0, + + 112, 86, 236, 116, 222, 0, // row 24 + 298, 158, 235, 339, 234, 0, + 289, 280, 110, 187, 281, 0, + 49, 157, 64, 193, 124, 0, + 236, 199, 0, 266, 0, 0, + 38, 170, 249, 288, 194, 0, + 9, 125, 191, 28, 6, 0, + 32, 178, 2, 156, 58, 0, + + 23, 136, 116, 182, 0, // row 25 + 72, 17, 383, 312, 0, + 172, 295, 96, 46, 0, + 1, 166, 65, 81, 0, + 205, 0, 0, 183, 0, + 279, 255, 111, 54, 0, + 4, 74, 16, 28, 0, + 27, 141, 11, 181, 0, + + 195, 243, 215, 61, 0, // row 26 + 71, 81, 76, 136, 0, + 270, 110, 318, 67, 0, + 107, 176, 212, 127, 0, + 0, 0, 0, 277, 0, + 325, 326, 226, 99, 0, + 21, 142, 192, 197, 0, + 163, 131, 169, 98, 0, + + 25, 104, 194, 0, // row 27 + 194, 194, 101, 0, + 210, 29, 304, 0, + 208, 141, 174, 0, + 45, 36, 72, 0, + 91, 326, 268, 0, + 98, 140, 22, 0, + 165, 232, 9, 0, + + 128, 165, 181, 63, 0, // row 28 + 222, 19, 244, 274, 0, + 11, 293, 50, 234, 0, + 146, 153, 217, 114, 0, + 275, 0, 155, 62, 0, + 102, 1, 40, 167, 0, + 4, 1, 40, 93, 0, + 32, 43, 200, 205, 0, + + 86, 236, 84, 6, 0, // row 29 + 252, 5, 147, 78, 0, + 27, 308, 117, 29, 0, + 150, 11, 53, 68, 0, + 0, 180, 0, 42, 0, + 273, 104, 243, 107, 0, + 92, 136, 106, 6, 0, + 232, 32, 118, 103, 0, + + 216, 73, 120, 9, 0, // row 30 + 159, 229, 260, 90, 0, + 91, 23, 105, 135, 0, + 34, 130, 210, 123, 0, + 0, 90, 252, 173, 0, + 171, 16, 95, 212, 0, + 2, 88, 112, 20, 0, + 170, 199, 26, 105, 0, + + 95, 177, 172, 61, 0, // row 31 + 100, 215, 258, 256, 0, + 222, 308, 66, 162, 0, + 175, 49, 177, 128, 0, + 144, 144, 166, 19, 0, + 101, 297, 279, 222, 0, + 4, 49, 125, 194, 0, + 73, 149, 175, 108, 0, + + 221, 112, 199, 121, 0, // row 32 + 102, 201, 175, 287, 0, + 210, 22, 271, 217, 0, + 192, 209, 58, 30, 0, + 0, 211, 36, 162, 0, + 351, 265, 338, 83, 0, + 6, 126, 63, 20, 0, + 103, 110, 151, 211, 0, + + 2, 187, 41, 211, 0, // row 33 + 323, 8, 361, 105, 0, + 170, 20, 140, 33, 0, + 114, 49, 161, 137, 0, + 0, 0, 76, 18, 0, + 56, 304, 141, 101, 0, + 10, 30, 6, 92, 0, + 199, 132, 172, 65, 0, + + 127, 167, 164, 159, 0, // row 34 + 230, 148, 202, 312, 0, + 187, 296, 5, 44, 0, + 82, 186, 68, 150, 0, + 197, 0, 108, 0, 0, + 60, 320, 112, 54, 0, + 4, 153, 197, 155, 0, + 161, 237, 142, 180, 0, + + 161, 197, 207, 103, 0, // row 35 + 320, 335, 2, 266, 0, + 207, 158, 55, 285, 0, + 192, 173, 26, 187, 0, + 199, 278, 0, 205, 0, + 100, 210, 195, 268, 0, + 4, 45, 168, 185, 0, + 231, 174, 145, 100, 0, + + 37, 105, 51, 120, 0, // row 36 + 210, 313, 297, 21, 0, + 259, 179, 178, 160, 0, + 222, 157, 0, 6, 0, + 216, 16, 
0, 0, 0, + 135, 15, 35, 188, 0, + 6, 200, 177, 43, 0, + 11, 207, 42, 100, 0, + + 198, 220, 122, 0, // row 37 + 269, 82, 115, 0, + 298, 15, 115, 0, + 81, 195, 138, 0, + 72, 144, 0, 0, + 319, 236, 85, 0, + 82, 2, 135, 0, + 59, 204, 161, 0, + + 167, 151, 157, 163, 0, // row 38 + 185, 177, 289, 214, 0, + 151, 179, 64, 181, 0, + 123, 90, 73, 10, 0, + 190, 0, 0, 0, 0, + 164, 196, 209, 246, 0, + 91, 64, 198, 100, 0, + 121, 90, 26, 140, 0, + + 173, 139, 149, 0, 0, // row 39 + 258, 93, 346, 297, 0, + 102, 77, 192, 208, 0, + 12, 77, 49, 114, 0, + 153, 0, 165, 117, 0, + 236, 264, 37, 272, 0, + 4, 28, 109, 188, 0, + 115, 188, 168, 52, 0, + + 157, 137, 149, 0, // row 40 + 175, 37, 312, 0, + 32, 80, 197, 0, + 67, 45, 96, 0, + 216, 144, 2, 0, + 304, 237, 135, 0, + 10, 84, 12, 0, + 4, 103, 30, 0, + + 167, 173, 139, 151, 0, // row 41 + 52, 314, 139, 288, 0, + 154, 47, 124, 207, 0, + 23, 215, 60, 167, 0, + 0, 0, 0, 183, 0, + 123, 77, 25, 272, 0, + 2, 75, 142, 128, 0, + 53, 189, 215, 24, 0, + + 149, 157, 137, 0, // row 42 + 113, 14, 218, 0, + 226, 65, 126, 0, + 114, 91, 78, 0, + 27, 0, 35, 0, + 288, 83, 17, 0, + 163, 10, 162, 0, + 222, 170, 71, 0, + + 151, 163, 173, 139, 0, // row 43 + 113, 132, 114, 168, 0, + 228, 69, 176, 102, 0, + 206, 22, 134, 161, 0, + 52, 243, 0, 270, 0, + 210, 3, 53, 167, 0, + 1, 163, 99, 98, 0, + 22, 127, 49, 125, 0, + + 139, 157, 163, 173, 0, // row 44 + 80, 78, 163, 274, 0, + 234, 227, 259, 260, 0, + 84, 4, 9, 12, 0, + 18, 0, 0, 57, 0, + 79, 244, 293, 272, 0, + 4, 6, 142, 3, 0, + 191, 211, 187, 148, 0, + + 149, 151, 167, 0, // row 45 + 135, 149, 15, 0, + 101, 228, 126, 0, + 184, 121, 29, 0, + 168, 0, 144, 0, + 82, 67, 235, 0, + 181, 45, 153, 0, + 177, 114, 93, 0 +}; + +// clang-format on + +// Base graph 2 is taken from 3GPP standard document 38.212 table 5.3.2-3. 
+// The format is consistent with the base graph 1 described above + +// The row start indices for base graph 2 +const uint32_t bg2_row_start[] = { + 0, 8, 18, 26, 36, 40, 46, 52, 58, 62, 67, 72, 77, 81, 86, + 91, 95, 100, 105, 109, 113, 117, 121, 124, 128, 132, 135, 140, 143, 147, + 150, 155, 158, 162, 166, 170, 174, 178, 181, 185, 189, 193, 197}; + +// clang-format off +const uint32_t bg2_columns[] = { + 0, 1, 2, 3, 6, 9, 10, 11, // row 0: 8 + 0, 3, 4, 5, 6, 7, 8, 9, 11, 12, // row 1: 10 + 0, 1, 3, 4, 8, 10, 12, 13, // row 2: 8 + 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, // row 3: 10 + 0, 1, 11, 14, // row 4: 4 + 0, 1, 5, 7, 11, 15, // row 5: 6 + 0, 5, 7, 9, 11, 16, // row 6: 6 + 1, 5, 7, 11, 13, 17, // row 7: 6 + 0, 1, 12, 18, // row 8: 4 + 1, 8, 10, 11, 19, // row 9: 5 + 0, 1, 6, 7, 20, // row 10: 5 + 0, 7, 9, 13, 21, // row 11: 5 + 1, 3, 11, 22, // row 12: 4 + 0, 1, 8, 13, 23, // row 13: 5 + 1, 6, 11, 13, 24, // row 14: 5 + 0, 10, 11, 25, // row 15: 4 + 1, 9, 11, 12, 26, // row 16: 5 + 1, 5, 11, 12, 27, // row 17: 5 + 0, 6, 7, 28, // row 18: 4 + 0, 1, 10, 29, // row 19: 4 + 1, 4, 11, 30, // row 20: 4 + 0, 8, 13, 31, // row 21: 4 + 1, 2, 32, // row 22: 3 + 0, 3, 5, 33, // row 23: 4 + 1, 2, 9, 34, // row 24: 4 + 0, 5, 35, // row 25: 3 + 2, 7, 12, 13, 36, // row 26: 5 + 0, 6, 37, // row 27: 3 + 1, 2, 5, 38, // row 28: 4 + 0, 4, 39, // row 29: 3 + 2, 5, 7, 9, 40, // row 30: 5 + 1, 13, 41, // row 31: 3 + 0, 5, 12, 42, // row 32: 4 + 2, 7, 10, 43, // row 33: 4 + 0, 12, 13, 44, // row 34: 4 + 1, 5, 11, 45, // row 35: 4 + 0, 2, 7, 46, // row 36: 4 + 10, 13, 47, // row 37: 3 + 1, 5, 11, 48, // row 38: 4 + 0, 7, 12, 49, // row 39: 4 + 2, 10, 13, 50, // row 40: 4 + 1, 5, 11, 51 // row 41: 4 +}; + +const uint32_t bg2_shifts[] = { + 9, 117, 204, 26, 189, 205, 0, 0, // row 0 + 174, 97, 166, 66, 71, 172, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 72, 110, 23, 181, 95, 8, 1, 0, + 3, 26, 53, 35, 115, 127, 0, 0, + 156, 143, 14, 3, 40, 123, 0, 0, + 143, 19, 176, 165, 196, 13, 0, 0, + 145, 131, 71, 21, 23, 112, 1, 0, + + 167, 166, 253, 125, 226, 156, 224, 252, 0, 0, // row 1 + 27, 36, 48, 92, 31, 187, 185, 3, 0, 0, + 137, 124, 0, 0, 88, 0, 0, 55, 0, 0, + 53, 156, 115, 156, 115, 200, 29, 31, 0, 0, + 19, 94, 104, 66, 84, 98, 69, 50, 0, 0, + 17, 65, 63, 1, 55, 37, 171, 133, 0, 0, + 18, 27, 3, 102, 185, 17, 14, 180, 0, 0, + 142, 174, 183, 27, 96, 23, 9, 167, 0, 0, + + 81, 114, 44, 52, 240, 1, 0, 0, // row 2 + 25, 114, 117, 110, 114, 1, 0, 0, + 20, 94, 99, 9, 108, 1, 0, 0, + 152, 131, 46, 191, 91, 0, 0, 0, + 95, 106, 92, 110, 111, 1, 0, 0, + 98, 168, 107, 82, 142, 1, 0, 0, + 126, 163, 47, 183, 132, 1, 0, 0, + 74, 31, 3, 53, 155, 0, 0, 0, + + 8, 58, 158, 104, 209, 54, 18, 128, 0, 0, // row 3 + 136, 175, 113, 72, 123, 118, 28, 186, 0, 0, + 38, 15, 102, 146, 12, 57, 53, 46, 0, 0, + 185, 6, 36, 124, 124, 110, 156, 133, 1, 0, + 120, 121, 22, 4, 73, 49, 128, 79, 0, 0, + 53, 174, 174, 127, 17, 89, 17, 105, 0, 0, + 36, 48, 18, 111, 203, 3, 191, 160, 0, 0, + 239, 171, 95, 110, 159, 199, 43, 75, 1, 0, + + 179, 214, 71, 0, // row 4 + 72, 74, 29, 0, + 0, 136, 157, 0, + 200, 16, 101, 0, + 42, 24, 51, 0, + 86, 67, 83, 0, + 43, 27, 117, 0, + 29, 140, 180, 0, + + 231, 41, 194, 159, 103, 0, // row 5 + 10, 44, 121, 80, 48, 0, + 0, 131, 142, 141, 64, 0, + 185, 138, 170, 219, 193, 0, + 40, 140, 84, 137, 71, 0, + 79, 84, 35, 103, 60, 0, + 136, 49, 36, 132, 62, 0, + 121, 41, 169, 88, 207, 0, + + 155, 228, 45, 28, 158, 0, // row 6 + 129, 92, 100, 49, 184, 0, + 0, 124, 99, 45, 148, 0, + 123, 55, 31, 222, 209, 0, + 109, 87, 107, 133, 139, 0, + 47, 154, 
10, 155, 29, 0, + 7, 34, 198, 168, 12, 0, + 137, 72, 172, 124, 56, 0, + + 129, 147, 140, 3, 116, 0, // row 7 + 80, 186, 16, 102, 143, 0, + 0, 45, 148, 96, 78, 0, + 103, 13, 105, 150, 181, 0, + 97, 135, 35, 108, 65, 0, + 48, 125, 24, 47, 55, 0, + 163, 78, 143, 107, 58, 0, + 86, 186, 87, 172, 154, 0, + + 142, 94, 230, 0, // row 8 + 118, 70, 152, 0, + 0, 65, 87, 0, + 147, 43, 152, 0, + 70, 69, 88, 0, + 53, 31, 161, 0, + 101, 177, 22, 0, + 176, 169, 225, 0, + + 203, 205, 61, 247, 0, // row 9 + 28, 132, 185, 178, 0, + 0, 97, 51, 85, 0, + 2, 30, 184, 83, 0, + 97, 40, 24, 49, 0, + 104, 142, 99, 64, 0, + 186, 27, 205, 81, 0, + 167, 238, 48, 68, 0, + + 11, 185, 0, 117, 0, // row 10 + 59, 104, 22, 52, 0, + 0, 17, 156, 20, 0, + 174, 150, 8, 56, 0, + 46, 41, 101, 96, 0, + 111, 25, 174, 23, 0, + 125, 60, 177, 51, 0, + 38, 217, 208, 232, 0, + + 11, 236, 210, 56, 0, // row 11 + 32, 92, 174, 154, 0, + 0, 7, 4, 2, 0, + 99, 138, 110, 99, 0, + 28, 30, 116, 64, 0, + 91, 175, 24, 141, 0, + 39, 29, 35, 8, 0, + 178, 214, 168, 51, 0, + + 63, 111, 14, 0, // row 12 + 39, 93, 11, 0, + 0, 113, 48, 0, + 46, 217, 109, 0, + 33, 122, 131, 0, + 122, 11, 4, 0, + 18, 155, 49, 0, + 124, 122, 72, 0, + + 83, 2, 38, 222, 0, // row 13 + 49, 125, 35, 166, 0, + 0, 112, 102, 26, 0, + 37, 113, 143, 140, 0, + 76, 37, 62, 47, 0, + 29, 91, 27, 127, 0, + 32, 53, 95, 186, 0, + 48, 57, 167, 219, 0, + + 115, 145, 3, 232, 0, // row 14 + 19, 118, 21, 163, 0, + 0, 138, 57, 27, 0, + 36, 95, 40, 116, 0, + 143, 51, 130, 97, 0, + 11, 145, 8, 166, 0, + 91, 20, 52, 109, 0, + 82, 232, 204, 162, 0, + + 51, 175, 213, 0, // row 15 + 68, 63, 81, 0, + 0, 73, 99, 0, + 116, 200, 110, 0, + 139, 96, 128, 0, + 137, 103, 40, 0, + 174, 108, 102, 0, + 38, 217, 157, 0, + + 203, 142, 8, 242, 0, // row 16 + 87, 177, 135, 64, 0, + 0, 79, 111, 143, 0, + 75, 158, 134, 97, 0, + 48, 9, 28, 8, 0, + 78, 158, 17, 165, 0, + 125, 31, 54, 176, 0, + 170, 23, 175, 202, 0, + + 254, 124, 114, 64, 0, // row 17 + 158, 23, 9, 6, 0, + 0, 24, 109, 18, 0, + 48, 132, 206, 2, 0, + 120, 43, 65, 42, 0, + 134, 23, 62, 163, 0, + 57, 201, 142, 35, 0, + 196, 173, 195, 218, 0, + + 220, 194, 50, 0, // row 18 + 186, 6, 46, 0, + 0, 18, 86, 0, + 68, 16, 156, 0, + 17, 106, 142, 0, + 173, 31, 22, 0, + 129, 203, 140, 0, + 128, 211, 210, 0, + + 87, 20, 185, 0, // row 19 + 58, 42, 156, 0, + 0, 158, 154, 0, + 35, 138, 86, 0, + 79, 28, 41, 0, + 13, 135, 145, 0, + 110, 124, 52, 0, + 39, 84, 88, 0, + + 26, 105, 29, 0, // row 20 + 76, 61, 153, 0, + 0, 148, 104, 0, + 6, 20, 141, 0, + 2, 103, 78, 0, + 128, 52, 173, 0, + 196, 35, 114, 0, + 117, 227, 6, 0, + + 76, 42, 210, 0, // row 21 + 157, 175, 67, 0, + 0, 17, 33, 0, + 80, 43, 81, 0, + 91, 75, 81, 0, + 156, 166, 40, 0, + 10, 122, 23, 0, + 238, 13, 11, 0, + + 222, 63, 0, // row 22 + 20, 52, 0, + 0, 4, 0, + 49, 1, 0, + 54, 132, 0, + 18, 163, 0, + 202, 126, 0, + 195, 44, 0, + + 23, 235, 238, 0, // row 23 + 106, 86, 95, 0, + 0, 75, 158, 0, + 156, 54, 134, 0, + 68, 115, 56, 0, + 110, 132, 150, 0, + 52, 170, 13, 0, + 5, 94, 111, 0, + + 46, 139, 8, 0, // row 24 + 182, 153, 64, 0, + 0, 69, 87, 0, + 153, 88, 63, 0, + 30, 42, 101, 0, + 113, 108, 61, 0, + 113, 161, 88, 0, + 81, 19, 130, 0, + + 228, 156, 0, // row 25 + 45, 21, 0, + 0, 65, 0, + 211, 94, 0, + 128, 63, 0, + 72, 136, 0, + 197, 194, 0, + 66, 95, 0, + + 29, 143, 160, 122, 0, // row 26 + 67, 137, 55, 85, 0, + 0, 100, 13, 7, 0, + 90, 6, 221, 6, 0, + 142, 28, 100, 133, 0, + 36, 38, 53, 145, 0, + 164, 172, 49, 161, 0, + 146, 66, 190, 86, 0, + + 8, 151, 0, // row 27 + 103, 50, 0, + 0, 32, 0, + 27, 118, 0, + 13, 10, 
0, + 42, 104, 0, + 168, 193, 0, + 64, 181, 0, + + 98, 101, 135, 0, // row 28 + 70, 111, 168, 0, + 0, 126, 110, 0, + 216, 212, 193, 0, + 106, 77, 43, 0, + 64, 24, 149, 0, + 14, 186, 46, 0, + 7, 144, 16, 0, + + 18, 28, 0, // row 29 + 110, 17, 0, + 0, 154, 0, + 108, 61, 0, + 133, 25, 0, + 139, 161, 0, + 50, 27, 0, + 25, 57, 0, + + 71, 240, 9, 84, 0, // row 30 + 120, 154, 52, 56, 0, + 0, 35, 51, 134, 0, + 106, 44, 185, 176, 0, + 87, 56, 104, 70, 0, + 84, 173, 93, 29, 0, + 70, 17, 50, 6, 0, + 37, 139, 221, 17, 0, + + 106, 1, 0, // row 31 + 3, 170, 0, + 0, 20, 0, + 147, 182, 0, + 80, 139, 0, + 117, 148, 0, + 115, 189, 0, + 201, 46, 0, + + 242, 44, 166, 0, // row 32 + 84, 8, 17, 0, + 0, 20, 122, 0, + 108, 21, 110, 0, + 32, 89, 71, 0, + 116, 73, 142, 0, + 110, 0, 163, 0, + 179, 14, 116, 0, + + 132, 164, 235, 0, // row 33 + 165, 179, 124, 0, + 0, 88, 13, 0, + 71, 12, 109, 0, + 135, 6, 2, 0, + 105, 137, 29, 0, + 163, 173, 179, 0, + 46, 2, 106, 0, + + 147, 85, 36, 0, // row 34 + 173, 177, 12, 0, + 0, 19, 78, 0, + 29, 201, 69, 0, + 37, 25, 114, 0, + 11, 41, 162, 0, + 197, 191, 193, 0, + 184, 135, 141, 0, + + 57, 40, 63, 0, // row 35 + 77, 184, 18, 0, + 0, 157, 6, 0, + 91, 165, 55, 0, + 60, 137, 93, 0, + 126, 152, 172, 0, + 157, 167, 181, 0, + 85, 225, 175, 0, + + 140, 38, 154, 0, // row 36 + 25, 151, 170, 0, + 0, 63, 82, 0, + 1, 175, 83, 0, + 121, 129, 26, 0, + 73, 154, 129, 0, + 197, 167, 179, 0, + 178, 112, 106, 0, + + 219, 151, 0, // row 37 + 37, 31, 0, + 0, 144, 0, + 40, 12, 0, + 97, 56, 0, + 167, 38, 0, + 181, 193, 0, + 154, 114, 0, + + 31, 66, 38, 0, // row 38 + 84, 151, 190, 0, + 0, 93, 19, 0, + 37, 97, 46, 0, + 1, 70, 1, 0, + 112, 7, 19, 0, + 157, 173, 191, 0, + 42, 41, 105, 0, + + 239, 172, 34, 0, // row 39 + 93, 132, 57, 0, + 0, 24, 138, 0, + 106, 181, 154, 0, + 119, 32, 142, 0, + 109, 6, 105, 0, + 181, 157, 173, 0, + 167, 45, 189, 0, + + 0, 75, 120, 0, // row 40 + 103, 107, 163, 0, + 0, 36, 143, 0, + 98, 35, 36, 0, + 6, 73, 102, 0, + 160, 156, 82, 0, + 193, 163, 179, 0, + 78, 67, 180, 0, + + 129, 229, 118, 0, // row 41 + 147, 7, 60, 0, + 0, 2, 55, 0, + 120, 101, 81, 0, + 48, 47, 19, 0, + 132, 6, 8, 0, + 191, 197, 167, 0, + 53, 215, 230, 0 +}; + // clang-format on inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, diff --git a/test/UpperPHY/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp index b10c743..7c53b9b 100644 --- a/test/UpperPHY/Turbo/turbo_test_data.hpp +++ b/test/UpperPHY/Turbo/turbo_test_data.hpp @@ -5,11 +5,8 @@ #pragma once #include "rng.hpp" - -#ifdef ARMRAL_ARCH_HWY -#include -#endif #include +#include static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { static armral::utils::linear_congruential_generator lcg; @@ -22,17 +19,6 @@ static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { // round k/8 up to the next multiple of 8 and divide by 8 auto len = (k_bytes + 7) / 8; -#ifndef ARMRAL_ARCH_HWY - for (uint32_t i = 0; i < len; ++i) { - uint32_t val32_0 = lcg.one(&state); - uint32_t val32_1 = lcg.one(&state); - uint32x2_t val32_2 = {val32_0, val32_1}; - uint8x8_t val8_8 = vreinterpret_u8_u32(val32_2); - for (uint32_t j = 0; j < 8 && (i * 8) + j < k_bytes; ++j) { - src[(i * 8) + j] = val8_8[j]; - } - } -#else for (uint32_t i = 0; i < len; ++i) { uint8_t val8_8[8]; uint32_t val32_0 = lcg.one(&state); @@ -43,7 +29,6 @@ static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { src[(i * 8) + j] = val8_8[j]; } } -#endif } // The valid values of k for Turbo coding, from TS36.212 -- GitLab
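For reference, the architecture-neutral generation loop kept by the final hunk above can be written without any NEON types. The following is a self-contained sketch, not the patch's exact code: the byte copy from the two 32-bit draws is assumed to use `memcpy`, since the middle of the new hunk is elided above, and `fill_random_bytes` and `NextU32` are illustrative names.

    #include <cstdint>
    #include <cstring>

    // Fill k_bytes bytes of dst from a stream of 32-bit random draws, eight
    // bytes (two draws) per iteration, with no NEON vector types involved.
    template <typename NextU32>
    void fill_random_bytes(uint8_t *dst, uint32_t k_bytes, NextU32 &&next) {
      uint32_t len = (k_bytes + 7) / 8;
      for (uint32_t i = 0; i < len; ++i) {
        uint8_t val8_8[8];
        uint32_t val32_0 = next();
        uint32_t val32_1 = next();
        memcpy(val8_8, &val32_0, sizeof(val32_0));
        memcpy(val8_8 + 4, &val32_1, sizeof(val32_1));
        // Copy at most k_bytes bytes overall into the destination.
        for (uint32_t j = 0; j < 8 && (i * 8) + j < k_bytes; ++j) {
          dst[(i * 8) + j] = val8_8[j];
        }
      }
    }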