diff --git a/.gitlab/merge_request_templates/Bugfix.md b/.gitlab/merge_request_templates/Bugfix.md index e7bce9ba05de1ef04d06cc4c072ce72cda0eb2f4..9af681edcb2c9f4771f8cab809377a12c45c069c 100644 --- a/.gitlab/merge_request_templates/Bugfix.md +++ b/.gitlab/merge_request_templates/Bugfix.md @@ -11,6 +11,7 @@ If an [Issue](https://gitlab.arm.com/networking/ral/-/issues) already exists for * [] ["Unreleased" section of the Changelog updated](https://gitlab.arm.com/networking/ral/-/blob/main/CHANGELOG.md#unreleased) * [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style) * [] [`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style) +* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style) * [] Commit message includes information on how to reproduce the issue(s) * [] [Tests added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-tests) * [] [Tests pass when run with AddressSanitizer](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md?ref_type=heads#user-content-testing-with-addresssanitizer) diff --git a/.gitlab/merge_request_templates/Default.md b/.gitlab/merge_request_templates/Default.md index 7e12eddf190a0e6aae4d5bc03acca22a34754550..ced68b50cc32d4c8404cf6bf0e8485613f362d11 100644 --- a/.gitlab/merge_request_templates/Default.md +++ b/.gitlab/merge_request_templates/Default.md @@ -15,6 +15,7 @@ If this Merge Request addresses an [Issue](https://gitlab.arm.com/networking/ral * [] ["Unreleased" section of the Changelog updated](https://gitlab.arm.com/networking/ral/-/blob/main/CHANGELOG.md#unreleased) * [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style) * [] [`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style) +* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style) * [] [Tests added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-tests) * [] [Tests pass when run with AddressSanitizer](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md?ref_type=heads#user-content-testing-with-addresssanitizer) * [] [Benchmarks added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-benchmarks) diff --git a/.gitlab/merge_request_templates/Documentation.md b/.gitlab/merge_request_templates/Documentation.md index ab2485305a5ffe4233143bb71ab77f426fa778fc..8803932a0ace4b4b13b4718f4eaf0ea07861a0db 100644 --- a/.gitlab/merge_request_templates/Documentation.md +++ b/.gitlab/merge_request_templates/Documentation.md @@ -13,5 +13,6 @@ If this Merge Request addresses an [Issue](https://gitlab.arm.com/networking/ral * [] [`make docs` target runs successfully](https://gitlab.arm.com/networking/ral/-/blob/main/README.md?ref_type=heads#user-content-documentation) * [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style) * [] 
[`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style) +* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style) For any items that are not checked, please provide details. diff --git a/CHANGELOG.md b/CHANGELOG.md index 7959c2d0b242a4958cdd2e0198e4c9152d90d855..b5085a6b51743ca5fa10517f0d250b6ffb7ffd70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,6 @@ documented in this file. ### Added ### Changed -- Moved `license_terms/BSD-3-Clause.txt` and -`license_terms/third_party_licenses.txt` to -[LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and -[THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md) -respectively. ### Deprecated @@ -22,6 +17,30 @@ respectively. ### Security +## [24.04] - 2024-04-19 + +### Added +- Makefile target `bench_excel_summary` to run the benchmarks and create an +Excel spreadsheet containing the results. + +### Changed +- Moved `license_terms/BSD-3-Clause.txt` and +`license_terms/third_party_licenses.txt` to +[LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and +[THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md) +respectively. + +- Extended `armral_cmplx_pseudo_inverse_direct_f32` and +`armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized +pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for the case +where `M` and/or `N` == 1. + +- Improved SVE2 performance of `armral_turbo_decode_block` and +`armral_turbo_decode_block_noalloc`. + +- Improved SVE2 performance of `armral_ldpc_encode_block` and +`armral_ldpc_encode_block_noalloc`. + ## [24.01] - 2024-01-19 ### Changed diff --git a/CMakeLists.txt b/CMakeLists.txt index 243d864e786bf0aed228e97c92009cc7225adab2..47894de53f8718dacdf7dc3d51dc910a82225f99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,165 +1,184 @@ cmake_minimum_required(VERSION 3.3) -project(armral VERSION 24.01) +project(armral VERSION 24.04) if(CMAKE_VERSION VERSION_GREATER 3.4) - # stop CMake from automatically adding -rdynamic to linker flags - # because the semihosting toolchain does not understand that flag + # Stop CMake from automatically adding -rdynamic to linker flags because it + # causes a warning about unused compiler options when using Clang cmake_policy(SET CMP0065 NEW) endif() # set default build type if none was specified with -DCMAKE_BUILD_TYPE=... if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to RELEASE as none was specified.") - set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE) + set(CMAKE_BUILD_TYPE + RELEASE + CACHE STRING "Choose the type of build." 
FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release") endif() -option(ARMRAL_ENABLE_WERROR "Enable -Werror when building the library and tests" OFF) -option(ARMRAL_ENABLE_ASAN "Enable AddressSanitizer when building the library and tests" OFF) -option(ARMRAL_ENABLE_EFENCE "Enable Electric Fence when building the library and tests" OFF) -option(ARMRAL_ENABLE_COVERAGE "Enable instrumentation for generating code coverage" OFF) +option(ARMRAL_ENABLE_WERROR + "Enable -Werror when building the library and tests" OFF) +option(ARMRAL_ENABLE_ASAN + "Enable AddressSanitizer when building the library and tests" OFF) +option(ARMRAL_ENABLE_EFENCE + "Enable Electric Fence when building the library and tests" OFF) +option(ARMRAL_ENABLE_COVERAGE + "Enable instrumentation for generating code coverage" OFF) option(BUILD_SIMULATION "Enable building channel simulation programs" ON) -set(ARMRAL_ARCH NEON CACHE STRING "The architecture to build for ('NEON' or 'SVE2')") +set(ARMRAL_ARCH + NEON + CACHE STRING "The architecture to build for ('NEON' or 'SVE2')") set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2") set(ARMRAL_LIB_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c - 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/SVD/arm_svd.cpp -) + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp)
 
 # Per source file compiler flag overrides/additions
 if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS)
-  # Enable -frename-registers for GCC release builds of arm_cmplx_mat_mult_ahb_f32.c.
-  # This improves register allocation for SVE instructions such as the indexed FMLA,
-  # which have a restricted range for the indexed operand. A patch to improve register
-  # allocation has been accepted upstream (see below) and will probably be part of GCC 14,
-  # but it is unlikely to be backported to any previous releases.
+  # Enable -frename-registers for GCC release builds of
+  # arm_cmplx_mat_mult_ahb_f32.c. This improves register allocation for SVE
+  # instructions such as the indexed FMLA, which have a restricted range for the
+  # indexed operand. A patch to improve register allocation has been accepted
+  # upstream (see below) and will probably be part of GCC 14, but it is unlikely
+  # to be backported to any previous releases.
   #
-  # See: https://github.com/gcc-mirror/gcc/commit/6d25ea520f7ed58568c9a0031409bc8e38b673f3
-  # Note: We don't universally enable this flag, as in some cases it can cause regressions.
-  set_property(SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c
-               APPEND PROPERTY COMPILE_OPTIONS $<$<AND:$<C_COMPILER_ID:GNU>,$<CONFIG:Release>>:-frename-registers>)
+  # See:
+  # https://github.com/gcc-mirror/gcc/commit/6d25ea520f7ed58568c9a0031409bc8e38b673f3
+  # Note: We don't universally enable this flag, as in some cases it can cause
+  # regressions.
+  set_property(
+    SOURCE
+      ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c
+    APPEND
+    PROPERTY COMPILE_OPTIONS
+             $<$<AND:$<C_COMPILER_ID:GNU>,$<CONFIG:Release>>:-frename-registers>
+  )
 
   if(ARMRAL_ENABLE_WERROR)
-    # Disable warnings-as-errors about C-style Variable Length Arrays in FFT source when using Clang++
-    set_property(SOURCE
-                 ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp
-                 ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp
-                 ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp
-                 APPEND PROPERTY COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-Wno-error=vla-extension>)
+    # Disable warnings-as-errors about C-style Variable Length Arrays in FFT
+    # source when using Clang++
+    set_property(
+      SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp
+             ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp
+             ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp
+      APPEND
+      PROPERTY COMPILE_OPTIONS
+               $<$<COMPILE_LANGUAGE:CXX>:-Wno-error=vla-extension>)
   endif()
 endif()
 
-set(ARMRAL_UTIL_SOURCES
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils/rng.cpp
-)
+set(ARMRAL_UTIL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/utils/rng.cpp)
 
 set(ARMRAL_LIB_INC
-  ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${CMAKE_CURRENT_SOURCE_DIR}/src
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils
-)
+    ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src
+    ${CMAKE_CURRENT_SOURCE_DIR}/utils)
 
 set(ARMRAL_TEST_INC
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo
-)
+    ${CMAKE_CURRENT_SOURCE_DIR}/utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo)
 
 set(ARMRAL_OVERRIDE_COMPILE_FLAGS FALSE)
 if(NOT CMAKE_C_FLAGS STREQUAL "")
-  if(CMAKE_CXX_FLAGS STREQUAL "")
-    message(FATAL_ERROR "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set")
-  endif()
+  if(CMAKE_CXX_FLAGS STREQUAL "")
+    message(
+      FATAL_ERROR
+        "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set"
+    )
+  endif()
 endif()
 if(NOT CMAKE_CXX_FLAGS STREQUAL "")
-  if(CMAKE_C_FLAGS STREQUAL "")
-    message(FATAL_ERROR "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set")
-  endif()
-  set(ARMRAL_OVERRIDE_COMPILE_FLAGS TRUE)
+  if(CMAKE_C_FLAGS STREQUAL "")
+    message(
+      FATAL_ERROR
+        "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set"
+    )
+  endif()
+  set(ARMRAL_OVERRIDE_COMPILE_FLAGS TRUE)
 endif()
 
 set(ARMRAL_COMPILER_FLAGS "")
@@ -167,7 +186,10 @@ set(ARMRAL_LINKER_FLAGS "")
 
 if(ARMRAL_ENABLE_WERROR)
   if(ARMRAL_OVERRIDE_COMPILE_FLAGS)
-    message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_WERROR")
+    message(
+      WARNING
+        "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_WERROR"
+    )
   else()
     set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -Werror)
   endif()
@@ -175,7 +197,10 @@ endif()
 
 if(ARMRAL_ENABLE_ASAN)
   if(ARMRAL_OVERRIDE_COMPILE_FLAGS)
-    message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_ASAN")
+    message(
+      WARNING
+        "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_ASAN"
+    )
   else()
     set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -fsanitize=address)
     set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -fsanitize=address)
@@ -184,7 +209,10 @@ endif()
 
 if(ARMRAL_ENABLE_EFENCE)
   if(ARMRAL_OVERRIDE_COMPILE_FLAGS)
-    message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_EFENCE")
+    message(
+      WARNING
+        "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_EFENCE"
+    )
   else()
     set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -lefence)
     set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -lefence)
@@ -193,16 +221,24 @@ endif()
 
 if(ARMRAL_ENABLE_COVERAGE)
   if(ARMRAL_OVERRIDE_COMPILE_FLAGS)
-    message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_COVERAGE")
+    message(
+      WARNING
+        "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_COVERAGE"
+    )
   else()
-    set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage -fprofile-update=atomic)
-    set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage -fprofile-update=atomic)
+    set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage
+                              -fprofile-update=atomic)
+    set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage
+                            -fprofile-update=atomic)
   endif()
 endif()
 
 if(ARMRAL_SEMIHOSTING)
   if(ARMRAL_OVERRIDE_COMPILE_FLAGS)
-    message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_SEMIHOSTING")
+    message(
+      WARNING
+        "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_SEMIHOSTING"
+    )
   else()
     set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --specs=rdimon.specs)
     set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -lc -lrdimon)
@@ -214,29 +250,39 @@ if(CMAKE_VERSION VERSION_GREATER 3.15)
 endif()
 
 if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS)
-  # If the optimisation flags are already set, don't try and guess what they
+  # If the optimization flags are already set, don't try and guess what they
   # should be.
   if(ARMRAL_ARCH STREQUAL "SVE2")
     set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2")
-    set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+sve2+crypto" CACHE INTERNAL "")
+    set(ARMRAL_ARCH_COMPILE_OPTIONS
+        "-march=armv8.5-a+sve2+crypto+fp16"
+        CACHE INTERNAL "")
   elseif(ARMRAL_ARCH STREQUAL "SVE")
     set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1")
-    set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+sve+crypto" CACHE INTERNAL "")
+    set(ARMRAL_ARCH_COMPILE_OPTIONS
+        "-march=armv8.2-a+sve+crypto+fp16"
+        CACHE INTERNAL "")
   elseif(ARMRAL_ARCH STREQUAL "NEON")
-    set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+crypto" CACHE INTERNAL "")
+    set(ARMRAL_ARCH_COMPILE_OPTIONS
+        "-march=armv8-a+crypto"
+        CACHE INTERNAL "")
   else()
-    message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
+    message(
+      FATAL_ERROR
+        "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
   endif()
 elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS)
-  # We explicitly set the optimisation flags, so just copy those. We still need to
-  # set the appropriate SVE version definition
+  # We explicitly set the optimization flags, so just copy those. We still need
+  # to set the appropriate SVE version definition
   set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS})
   if(ARMRAL_ARCH STREQUAL "SVE2")
     set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2")
   elseif(ARMRAL_ARCH STREQUAL "SVE")
     set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1")
   elseif(NOT ARMRAL_ARCH STREQUAL "NEON")
-    message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
+    message(
+      FATAL_ERROR
+        "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
   endif()
 else()
   set(ARMRAL_ARCH_COMPILE_OPTIONS "")
@@ -245,19 +291,32 @@ else()
   elseif(ARMRAL_ARCH STREQUAL "SVE")
     set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1")
   elseif(NOT ARMRAL_ARCH STREQUAL "NEON")
-    message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
+    message(
+      FATAL_ERROR
+        "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'")
   endif()
 endif()
 
 if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS)
   set(ARMRAL_COMPILER_FLAGS
-      ${ARMRAL_COMPILER_FLAGS}
-      $<$<COMPILE_LANGUAGE:C>:-Wshadow -Wall -Wcast-qual>
-      $<$<COMPILE_LANGUAGE:CXX>:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17>
-      $<$<CONFIG:Debug>:-Og -g3 -ggdb -fno-omit-frame-pointer>)
+      ${ARMRAL_COMPILER_FLAGS}
+      $<$<COMPILE_LANGUAGE:C>:-Wshadow
+      -Wall
+      -Wcast-qual>
+      $<$<COMPILE_LANGUAGE:CXX>:-Wshadow
+      -Wall
+      -Wcast-qual
+      -fno-rtti
+      -fno-exceptions
+      -std=c++17>
+      $<$<CONFIG:Debug>:-Og
+      -g3
+      -ggdb
+      -fno-omit-frame-pointer>)
   message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}")
 else()
-  # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the compile line
+  # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the
+  # compile line
   message(STATUS "Overriding compilation flags with manually set flags")
   message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
   message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
@@ -268,26 +327,31 @@ endif()
 
 add_library(armral ${ARMRAL_LIB_SOURCES})
 target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC})
 target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE})
-target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS})
+target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS}
+                                      ${ARMRAL_COMPILER_FLAGS})
 target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS})
 
 add_library(armral_utils ${ARMRAL_UTIL_SOURCES})
 target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC})
 target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE})
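As an aside on the compile-flag pattern above: `ARMRAL_COMPILER_FLAGS` mixes C-only, C++-only, and Debug-only options in one list by wrapping each group in a CMake generator expression. A minimal self-contained sketch of the same technique follows; the `demo` target, its sources, and the exact flag choices are hypothetical illustrations, not part of this patch.

```cmake
cmake_minimum_required(VERSION 3.3)
project(genexpr_demo C CXX)

# Stand-in target; demo.c and demo.cpp are hypothetical sources.
add_library(demo demo.c demo.cpp)

target_compile_options(
  demo
  PRIVATE # Applied only when compiling C translation units
          $<$<COMPILE_LANGUAGE:C>:-Wcast-qual>
          # Applied only when compiling C++ translation units
          $<$<COMPILE_LANGUAGE:CXX>:-fno-exceptions>
          # A quoted, ;-separated group applies several flags under one condition
          "$<$<CONFIG:Debug>:-Og;-g3;-fno-omit-frame-pointer>"
          # Combined condition: C compiled by GCC in a Release build
          $<$<AND:$<C_COMPILER_ID:GNU>,$<CONFIG:Release>>:-frename-registers>)
```

Unlike a configure-time `if()`, generator expressions are evaluated per source file and per configuration at generation time, which is why a single flags list can serve both languages and all build types.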
-target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) +target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) if(ARMRAL_SEMIHOSTING) - # when semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a - # compiler flag, so we specify the string "ARMRAL_SEMIHOSTING" - # rather than the CMake variable ARMRAL_SEMIHOSTING + # when semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable + # ARMRAL_SEMIHOSTING target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") target_compile_definitions(armral_utils PUBLIC "ARMRAL_SEMIHOSTING") endif() install(TARGETS armral DESTINATION lib) -install(DIRECTORY include/ DESTINATION include - FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY include/ + DESTINATION include + FILES_MATCHING + PATTERN "*.h") install(FILES LICENSE.md THIRD_PARTY_LICENSES.md DESTINATION share/licenses/armral) @@ -305,26 +369,36 @@ if(BUILD_TESTING) set(BENCHMARKER_RUNNER "${BENCHMARKER_SOURCE_DIR}/bench/default_runner.py") endif() - add_custom_target(check - COMMAND ${CMAKE_CTEST_COMMAND} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - - add_custom_target(bench - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_SOURCE_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - ${JOB_POOL_CONSOLE} - WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) - - add_custom_target(bench_concurrent - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_SOURCE_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - --concurrent - ${JOB_POOL_CONSOLE} - WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + add_custom_target( + check + COMMAND ${CMAKE_CTEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_custom_target( + bench + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_concurrent + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_excel_summary + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent | tee + ${BENCHMARKER_BUILD_DIR}/out.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py + ${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) set(ARMRAL_TEST_LINK_LIBRARIES armral armral_utils) @@ -334,232 +408,365 @@ if(BUILD_TESTING) # utility function to add a test function(add_armral_test TEST_NAME TEST_SOURCE) - get_filename_component(TEST_DIR ${TEST_SOURCE} DIRECTORY) - # build the actual test executable itself add_executable(${TEST_NAME} ${TEST_SOURCE}) - target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) + target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) target_include_directories(${TEST_NAME} PRIVATE ${ARMRAL_TEST_INC}) - target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} ${ARMRAL_ARCH_COMPILE_OPTIONS}) + target_compile_options(${TEST_NAME} PRIVATE 
${ARMRAL_COMPILER_FLAGS} + ${ARMRAL_ARCH_COMPILE_OPTIONS}) # register it as a test, set up dependencies - add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} + ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) add_dependencies(check ${TEST_NAME}) endfunction() # utility function to add a benchmark function(add_armral_bench BENCH_NAME BENCH_SOURCE) - get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) # build the actual bench executable itself add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) - target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) + target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) # register it as a benchmark, set up dependencies add_dependencies(bench bench_${BENCH_NAME}) add_dependencies(bench_concurrent bench_${BENCH_NAME}) + add_dependencies(bench_excel_summary bench_${BENCH_NAME}) # add target for running the benchmark - add_custom_target(run_bench_${BENCH_NAME} - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - --concurrent - ${JOB_POOL_CONSOLE} + get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) + add_custom_target( + run_bench_${BENCH_NAME} + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py + ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} ${BENCHMARKER_BUILD_DIR} + --runner ${BENCHMARKER_RUNNER} --concurrent ${JOB_POOL_CONSOLE} WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR} DEPENDS bench_${BENCH_NAME}) -endfunction() - - add_armral_test(fft_cf32 test/FFT/cf32/main.cpp) - add_armral_test(fft_cs16 test/FFT/cs16/main.cpp) - add_armral_test(arm_fir_filter_cf32 test/FIR/arm_fir_filter_cf32/main.cpp) - add_armral_test(arm_fir_filter_cf32_decimate_2 test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp) - add_armral_test(arm_fir_filter_cs16 test/FIR/arm_fir_filter_cs16/main.cpp) - add_armral_test(arm_fir_filter_cs16_decimate_2 test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp) - add_armral_test(modulation test/Modulation/main.cpp) - add_armral_test(demodulation test/Demodulation/main.cpp) - add_armral_test(mu_law_compression test/MuLaw/Compression/main.cpp) - add_armral_test(mu_law_decompression test/MuLaw/Decompression/main.cpp) - add_armral_test(vec_dot_16_32_bit test/VectorDotProd/vecDot16_32bit/main.cpp) - add_armral_test(vec_dot_16_2 test/VectorDotProd/vecDot16_2/main.cpp) - add_armral_test(vec_dot_32 test/VectorDotProd/vecDot32/main.cpp) - add_armral_test(vec_dot_32_2 test/VectorDotProd/vecDot32_2/main.cpp) - add_armral_test(vec_dot_16_2_32_bit test/VectorDotProd/vecDot16_2_32bit/main.cpp) - add_armral_test(vec_dot_16 test/VectorDotProd/vecDot16/main.cpp) - add_armral_test(crc test/CRC/main.cpp) - add_armral_test(matrix_mult_16 test/MatrixMult/single/MatrixMult16/main.cpp) - add_armral_test(arm_solve test/MatrixMult/batch/ArmSolve/main.cpp) - add_armral_test(matrix_mult_32 test/MatrixMult/single/MatrixMult32/main.cpp) - add_armral_test(matrix_mult_aah_32 test/MatrixMult/single/MatrixMultAAH32/main.cpp) - add_armral_test(matrix_mult_ahb_32 test/MatrixMult/single/MatrixMultAHB32/main.cpp) - add_armral_test(vec_mul_32_2 test/ElemWiseVectorMult/vecMul32_2/main.cpp) - add_armral_test(vec_mul_16 
test/ElemWiseVectorMult/vecMul16/main.cpp) - add_armral_test(vec_mul_32 test/ElemWiseVectorMult/vecMul32/main.cpp) - add_armral_test(vec_mul_16_2 test/ElemWiseVectorMult/vecMul16_2/main.cpp) - add_armral_test(polar_decoder test/Polar/decoding/main.cpp) - add_armral_test(polar_encoder test/Polar/encoding/main.cpp) - add_armral_test(polar_frozen_mask test/Polar/frozen/main.cpp) - add_armral_test(polar_rate_matching test/Polar/rate_matching/main.cpp) - add_armral_test(polar_rate_recovery test/Polar/rate_recovery/main.cpp) - add_armral_test(polar_subchannel_interleave test/Polar/subchannel_interleave/main.cpp) - add_armral_test(polar_subchannel_deinterleave test/Polar/subchannel_deinterleave/main.cpp) - add_armral_test(polar_crc_attachment test/Polar/crc_attachment/main.cpp) - add_armral_test(block_scaling_compression test/ORanBlockScaling/Compression/main.cpp) - add_armral_test(block_scaling_decompression test/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(block_float_compression test/XRanBlockFloat/Compression/main.cpp) - add_armral_test(block_float_decompression test/XRanBlockFloat/Decompression/main.cpp) - add_armral_test(correlation test/Correlation/main.cpp) - add_armral_test(matrix_inv_single test/MatrixInv/single/main.cpp) - add_armral_test(matrix_inv_batch test/MatrixInv/batch/main.cpp) - add_armral_test(matrix_pseudo_inv_direct test/MatrixPseudoInv/direct/main.cpp) - add_armral_test(seq_generator test/SeqGenerator/main.cpp) - add_armral_test(scrambling test/Scrambling/main.cpp) - add_armral_test(ldpc_encoding test/LDPC/encoding/main.cpp) - add_armral_test(ldpc_decoding test/LDPC/decoding/main.cpp) - add_armral_test(ldpc_rate_matching test/LDPC/rate_matching/main.cpp) - add_armral_test(ldpc_rate_recovery test/LDPC/rate_recovery/main.cpp) - add_armral_test(svd test/SVD/main.cpp) - add_armral_test(matrix_vector_mult_single_16 test/MatrixMult/single/MatrixVectorMult16/main.cpp) - add_armral_test(matrix_vector_mult_single_32 test/MatrixMult/single/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_vector_mult_batch_16 test/MatrixMult/batch/MatrixVectorMult16/main.cpp) - add_armral_test(matrix_vector_mult_batch_32 test/MatrixMult/batch/MatrixVectorMult32/main.cpp) - add_armral_test(turbo_encoding test/Turbo/encoding/main.cpp) - add_armral_test(turbo_decoding test/Turbo/decoding/main.cpp) - add_armral_test(turbo_rate_matching test/Turbo/rate_matching/main.cpp) - add_armral_test(turbo_rate_recovery test/Turbo/rate_recovery/main.cpp) - add_armral_test(tail_biting_convolutional_encoding test/ConvCoding/encoding/main.cpp) - add_armral_test(tail_biting_convolutional_decoding test/ConvCoding/decoding/main.cpp) - - add_armral_bench(correlation bench/Correlation/main.cpp) - add_armral_bench(crc_6_be bench/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/CRC/6/LittleEndian/main.cpp) - add_armral_bench(crc_11_be bench/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(demodulation 
bench/Demodulation/main.cpp) - add_armral_bench(vec_mul_16 bench/ElemWiseVectorMult/VecMul16/main.cpp) - add_armral_bench(vec_mul_16_2 bench/ElemWiseVectorMult/VecMul16_2/main.cpp) - add_armral_bench(vec_mul_32 bench/ElemWiseVectorMult/VecMul32/main.cpp) - add_armral_bench(vec_mul_32_2 bench/ElemWiseVectorMult/VecMul32_2/main.cpp) - add_armral_bench(fft_cs16 bench/FFT/FFT16/main.cpp) - add_armral_bench(fft_cf32 bench/FFT/FFT32/main.cpp) - add_armral_bench(arm_fir_filter_cs16 bench/FIR/FIR16/main.cpp) - add_armral_bench(arm_fir_filter_cs16_decimate_2 bench/FIR/FIR16Decimate2/main.cpp) - add_armral_bench(arm_fir_filter_cf32 bench/FIR/FIR32/main.cpp) - add_armral_bench(arm_fir_filter_cf32_decimate_2 bench/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(ldpc_decoding bench/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching bench/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery bench/LDPC/RateRecovery/main.cpp) - add_armral_bench(matrix_inv_single_general bench/MatrixInv/Single/GeneralMatInv/main.cpp) - add_armral_bench(matrix_inv_single_hermitian bench/MatrixInv/Single/HermitianMatInv/main.cpp) - add_armral_bench(matrix_inv_batch_general bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - add_armral_bench(matrix_inv_batch_general_pa bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - add_armral_bench(matrix_inv_batch_hermitian bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - add_armral_bench(matrix_inv_batch_hermitian_pa bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - add_armral_bench(matrix_pseudo_inv_direct bench/MatrixPseudoInv/Direct/main.cpp) - add_armral_bench(arm_solve_1x2 bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - add_armral_bench(arm_solve_1x4 bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - add_armral_bench(arm_solve_2x2 bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - add_armral_bench(arm_solve_2x4 bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - add_armral_bench(arm_solve_4x4 bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - add_armral_bench(matrix_mult_i16_32b bench/MatrixMult/Single/MatrixMult16/32b/main.cpp) - add_armral_bench(matrix_mult_i16_64b bench/MatrixMult/Single/MatrixMult16/64b/main.cpp) - add_armral_bench(matrix_mult_f32_2x2 bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - add_armral_bench(matrix_mult_f32_2x2_iq bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - add_armral_bench(matrix_mult_f32_4x4 bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - add_armral_bench(matrix_mult_f32_4x4_iq bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - add_armral_bench(matrix_mult_f32_general bench/MatrixMult/Single/MatrixMult32/general/main.cpp) - add_armral_bench(matrix_mult_ahb_32 bench/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_bench(matrix_mult_aah_32 bench/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_bench(modulation bench/Modulation/main.cpp) - add_armral_bench(mu_law_compression_8bit bench/MuLaw/Compression/8bit/main.cpp) - add_armral_bench(mu_law_compression_9bit bench/MuLaw/Compression/9bit/main.cpp) - add_armral_bench(mu_law_compression_14bit bench/MuLaw/Compression/14bit/main.cpp) - add_armral_bench(mu_law_decompression_8bit bench/MuLaw/Decompression/8bit/main.cpp) - add_armral_bench(mu_law_decompression_9bit bench/MuLaw/Decompression/9bit/main.cpp) - add_armral_bench(mu_law_decompression_14bit bench/MuLaw/Decompression/14bit/main.cpp) - add_armral_bench(block_scaling_compression_8bit 
bench/ORanBlockScaling/Compression/8bit/main.cpp) - add_armral_bench(block_scaling_compression_9bit bench/ORanBlockScaling/Compression/9bit/main.cpp) - add_armral_bench(block_scaling_compression_14bit bench/ORanBlockScaling/Compression/14bit/main.cpp) - add_armral_bench(block_scaling_decompression_8bit bench/ORanBlockScaling/Decompression/8bit/main.cpp) - add_armral_bench(block_scaling_decompression_9bit bench/ORanBlockScaling/Decompression/9bit/main.cpp) - add_armral_bench(block_scaling_decompression_14bit bench/ORanBlockScaling/Decompression/14bit/main.cpp) - add_armral_bench(block_float_compression_8bit bench/XRanBlockFloat/Compression/8bit/main.cpp) - add_armral_bench(block_float_compression_9bit bench/XRanBlockFloat/Compression/9bit/main.cpp) - add_armral_bench(block_float_compression_12bit bench/XRanBlockFloat/Compression/12bit/main.cpp) - add_armral_bench(block_float_compression_14bit bench/XRanBlockFloat/Compression/14bit/main.cpp) - add_armral_bench(block_float_decompression_8bit bench/XRanBlockFloat/Decompression/8bit/main.cpp) - add_armral_bench(block_float_decompression_9bit bench/XRanBlockFloat/Decompression/9bit/main.cpp) - add_armral_bench(block_float_decompression_12bit bench/XRanBlockFloat/Decompression/12bit/main.cpp) - add_armral_bench(block_float_decompression_14bit bench/XRanBlockFloat/Decompression/14bit/main.cpp) - add_armral_bench(polar_decoder bench/Polar/Decoding/main.cpp) - add_armral_bench(polar_encoder bench/Polar/Encoding/main.cpp) - add_armral_bench(polar_frozen_mask bench/Polar/Frozen/main.cpp) - add_armral_bench(polar_rate_matching bench/Polar/RateMatching/main.cpp) - add_armral_bench(polar_rate_recovery bench/Polar/RateRecovery/main.cpp) - add_armral_bench(polar_subchannel_deinterleave bench/Polar/SubchannelDeinterleave/main.cpp) - add_armral_bench(polar_subchannel_interleave bench/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(seq_generator bench/SeqGenerator/main.cpp) - add_armral_bench(scrambling bench/Scrambling/main.cpp) - add_armral_bench(svd bench/SVD/main.cpp) - add_armral_bench(vec_dot_16 bench/VectorDotProd/VecDot16/main.cpp) - add_armral_bench(vec_dot_16_2 bench/VectorDotProd/VecDot16_2/main.cpp) - add_armral_bench(vec_dot_16_2_32_bit bench/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_bench(vec_dot_16_32_bit bench/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_bench(vec_dot_32 bench/VectorDotProd/VecDot32/main.cpp) - add_armral_bench(vec_dot_32_2 bench/VectorDotProd/VecDot32_2/main.cpp) - add_armral_bench(matrix_vector_mult_i16_32b bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - add_armral_bench(matrix_vector_mult_i16_64b bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - add_armral_bench(matrix_vector_mult_32 bench/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_32b bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_32b_pa bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_64b bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_64b_pa bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_f32 bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_f32_pa bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - add_armral_bench(turbo_encoding bench/Turbo/Encoding/main.cpp) - 
add_armral_bench(turbo_decoding bench/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_rate_matching bench/Turbo/RateMatching/main.cpp) - add_armral_bench(turbo_rate_recovery bench/Turbo/RateRecovery/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding bench/ConvCoding/Encoding/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding bench/ConvCoding/Decoding/main.cpp) + endfunction() + + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) + add_armral_test(arm_solve + test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + add_armral_test( + matrix_vector_mult_batch_16 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_batch_32 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_mult_16 + test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + add_armral_test(matrix_mult_32 + test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + add_armral_test(matrix_mult_aah_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_test(matrix_mult_ahb_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_test( + matrix_vector_mult_single_16 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_single_32 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_pseudo_inv_direct + test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_test(vec_dot_16_2 + test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_test(vec_dot_16_2_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_test(vec_dot_16_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_test(vec_dot_32_2 + test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_test(mu_law_compression + test/DuRuInterface/MuLaw/Compression/main.cpp) + add_armral_test(mu_law_decompression + test/DuRuInterface/MuLaw/Decompression/main.cpp) + add_armral_test(block_float_compression + test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + add_armral_test(block_float_decompression + test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + add_armral_test(block_scaling_compression + test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + add_armral_test(block_scaling_decompression + test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + add_armral_test(arm_fir_filter_cs16_decimate_2 + test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + add_armral_test(arm_fir_filter_cf32_decimate_2 + 
test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(polar_crc_attachment + test/UpperPHY/Polar/CrcAttachment/main.cpp) + add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) + add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) + add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_test(polar_subchannel_deinterleave + test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_test(polar_subchannel_interleave + test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + + add_armral_bench( + matrix_inv_batch_general + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + add_armral_bench(matrix_inv_batch_general_pa + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian_pa + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + add_armral_bench(matrix_inv_single_general + bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + add_armral_bench(matrix_inv_single_hermitian + bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + add_armral_bench(arm_solve_1x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + add_armral_bench(arm_solve_1x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + add_armral_bench(arm_solve_2x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + add_armral_bench(arm_solve_2x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + add_armral_bench(arm_solve_4x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b_pa + 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32 + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + add_armral_bench( + matrix_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + add_armral_bench( + matrix_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_general + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + add_armral_bench( + matrix_mult_aah_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_bench( + matrix_mult_ahb_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + add_armral_bench( + matrix_vector_mult_32 + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_bench(matrix_pseudo_inv_direct + bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_bench(vec_dot_16 + bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_bench(vec_dot_16_2 + bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_bench(vec_dot_16_2_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_bench(vec_dot_16_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_bench(vec_dot_32 + bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_bench(vec_dot_32_2 + bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_bench(vec_mul_16_2 + bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_bench(vec_mul_32_2 + bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_bench(mu_law_compression_14bit + bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + add_armral_bench(mu_law_compression_8bit + bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + add_armral_bench(mu_law_compression_9bit + bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + add_armral_bench(mu_law_decompression_14bit + bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + add_armral_bench(mu_law_decompression_8bit + bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + add_armral_bench(mu_law_decompression_9bit + bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + add_armral_bench( + block_float_compression_12bit + bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + add_armral_bench( + block_float_compression_14bit + bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + add_armral_bench(block_float_compression_8bit + 
bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + add_armral_bench(block_float_compression_9bit + bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + add_armral_bench( + block_float_decompression_12bit + bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + add_armral_bench( + block_float_decompression_14bit + bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + add_armral_bench( + block_float_decompression_8bit + bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + add_armral_bench( + block_float_decompression_9bit + bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + add_armral_bench( + block_scaling_compression_14bit + bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + add_armral_bench( + block_scaling_compression_8bit + bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + add_armral_bench( + block_scaling_compression_9bit + bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + add_armral_bench( + block_scaling_decompression_14bit + bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + add_armral_bench( + block_scaling_decompression_8bit + bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + add_armral_bench( + block_scaling_decompression_9bit + bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + add_armral_bench(arm_fir_filter_cs16_decimate_2 + bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + add_armral_bench(arm_fir_filter_cf32_decimate_2 + bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation 
bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + add_armral_bench(polar_rate_matching + bench/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_bench(polar_rate_recovery + bench/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_bench(polar_subchannel_deinterleave + bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_bench(polar_subchannel_interleave + bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) endif() if(BUILD_EXAMPLES) - add_custom_target(make_examples_dir ALL - COMMAND ${CMAKE_COMMAND} -E make_directory examples) + add_custom_target(make_examples_dir ALL COMMAND ${CMAKE_COMMAND} -E + make_directory examples) add_custom_target(examples) add_custom_target(run_examples) add_dependencies(run_examples examples) - # Any parameters after the first one will be passed as parameters - # to the example executable when running it + # Any parameters after the first one will be passed as parameters to the + # example executable when running it function(add_armral_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_EXE ${EXAMPLE_SOURCE} NAME_WE) add_executable(${EXAMPLE_EXE} ${EXAMPLE_SOURCE}) add_dependencies(${EXAMPLE_EXE} make_examples_dir) set(EXAMPLE_OUTPUT_NAME examples/${EXAMPLE_EXE}) - set_target_properties(${EXAMPLE_EXE} - PROPERTIES - OUTPUT_NAME ${EXAMPLE_OUTPUT_NAME}) + set_target_properties(${EXAMPLE_EXE} PROPERTIES OUTPUT_NAME + ${EXAMPLE_OUTPUT_NAME}) target_link_libraries(${EXAMPLE_EXE} armral m) - add_custom_target(run_${EXAMPLE_EXE} - COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} - DEPENDS ${EXAMPLE_EXE} - ) + add_custom_target( + run_${EXAMPLE_EXE} + COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} + DEPENDS ${EXAMPLE_EXE}) add_dependencies(examples ${EXAMPLE_EXE}) add_dependencies(run_examples run_${EXAMPLE_EXE}) endfunction() @@ -571,44 +778,50 @@ if(BUILD_EXAMPLES) endif() if(BUILD_SIMULATION) - # Include simulation rules and targets - # This involves building dependencies like AWGN library and OpenMP + # Include simulation rules and targets. This involves building dependencies + # like AWGN library and OpenMP add_subdirectory(simulation) endif() find_package(Doxygen) -if (DOXYGEN_FOUND) +if(DOXYGEN_FOUND) set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) add_custom_target(docs COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}) endif() -# uninstall target +# Create target to uninstall the library if(NOT TARGET uninstall) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" - "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" - IMMEDIATE @ONLY) + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) - add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) + add_custom_target( + uninstall COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() - # Check that the C and
C++ compilers are from the same toolchain -if (NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) - message(FATAL_ERROR "CXX and C compiler providers differ. Please specify the same compiler toolchain") +if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) + message( + FATAL_ERROR + "CXX and C compiler providers differ. Please specify the same compiler toolchain" + ) endif() -set (COMP_ERR_MSG "Compilation is only supported with GNU versions 7, 8, 9, 10, \ +set(COMP_ERR_MSG + "Compilation is only supported with GNU versions 7, 8, 9, 10, \ 11, 12, 13, or Clang versions greater than or equal to 12.0.1. \ - If compilation fails please use one of the supported compilers.") + If compilation fails please use one of the supported compilers." +) -if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION VERSION_GREATER 13.2) +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION + VERSION_GREATER 13.2) message(WARNING ${COMP_ERR_MSG}) endif() -elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang") - if (CMAKE_C_COMPILER_VERSION VERSION_LESS 12.0.1) +elseif(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 12.0.1) message(WARNING ${COMP_ERR_MSG}) endif() else() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53a8a3013a88e2792e3afb713898db589f695019..d4b42d1c18cb629f603fa1dad80f6647bb4ed949 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,7 @@ -# Contributing to Arm RAN Acceleration Library (Arm RAL) +# Contributing to Arm RAN Acceleration Library (ArmRAL) Describes the requirements for contributing code to Arm RAN -Acceleration Library (Arm RAL): +Acceleration Library (ArmRAL): - The license; - How to write and submit patches; @@ -47,13 +47,15 @@ any relevant reasoning. ## Function naming -Arm RAL functions are named according to: +ArmRAL functions are named according to: - armral_<algorithm>_<precision>{_variant} +``` +armral_<algorithm>_<precision>{_variant} +``` where: -- *algorithm* is a word or words that summarises the main purpose of +- *algorithm* is a word or words that summarizes the main purpose of the function; - *precision* indicates the working precision of the internals of the @@ -62,13 +64,13 @@ where: For Fast Fourier Transform (FFT) functions use: - - `cf32`: complex 32-bit floating point; - - `cs16`: complex signed 16-bit integer. + - `cf32`: complex 32-bit floating point; + - `cs16`: complex signed 16-bit integer. For all other functions use: - - `f32`: 32-bit floating point; - - `i16`: signed 16-bit integer. + - `f32`: 32-bit floating point; + - `i16`: signed 16-bit integer. - *variant* is an optional suffix to distinguish different implementations of the same *algorithm* at the same *precision*. @@ -82,9 +84,9 @@ Examples from the library: `armral_cmplx_mat_mult_2x2_f32_iq` | Complex-valued 2x2 matrix multiplication | 32-bit floating point | Separate I and Q arrays -## Directory structure +## ArmRAL directory structure -The directory structure of Arm RAL is: +The directory structure of ArmRAL is: ``` +-- CMakeLists.txt @@ -93,9 +95,12 @@ The directory structure of Arm RAL is: +-- RELEASE_NOTES.md +-- THIRD_PARTY_LICENSES.md +-- bench -| +-- CRC -| +-- bench.py -| +-- main.cpp +| +-- BasicMathFun +| +-- MatrixInv +| +-- Single +| +-- GeneralMatInv +| +-- bench.py +| +-- main.cpp | +-- ... +-- docs | +-- ...
@@ -113,8 +118,10 @@ The directory structure of Arm RAL is: | +-- ... | +-- ... +-- test -| +-- CRC -| +-- main.cpp +| +-- BasicMathFun +| +-- MatrixInv +| +-- Single +| +-- main.cpp | +-- ... +-- utils | +-- ... @@ -142,13 +149,13 @@ the custom allocators defined in `src/utils/allocators.hpp`. These offer two advantages: 1. Developers do not need to ensure dynamically-allocated memory is -freed after use. + freed after use. 2. All user-facing functions (defined in `include/armral.h`) that need -to allocate memory internally must also provide a non-allocating -version that allows users to pass in a pre-allocated buffer. Using Arm -RAL's custom allocators simplifies writing these variants because they -offer a counting allocator in addition to one that uses `malloc`. + to allocate memory internally must also provide a non-allocating + version that allows users to pass in a pre-allocated buffer. Using + ArmRAL's custom allocators simplifies writing these variants because they + offer a counting allocator in addition to one that uses `malloc`. C-style variable length arrays (VLAs) can only be used in the FFT functions (`armral/src/LowerPHY/FFT`). @@ -156,7 +163,7 @@ functions (`armral/src/LowerPHY/FFT`). ### Namespaces All symbols in the library must be clearly identified as coming from -Arm RAL. User-facing functions specified in `include/armral.h` are +ArmRAL. User-facing functions specified in `include/armral.h` are identified by the prefix `armral_`. Using C++ enables us to enclose other library functions in namespaces. These namespaces must begin with `armral::` and can themselves contain further namespaces to @@ -168,7 +175,7 @@ the `static` keyword. ### No dependency on C++ standard library at runtime -We require that Arm RAL does not have a dependency on the C++ runtime +We require that ArmRAL does not have a dependency on the C++ runtime library as this enables `libarmral` to be linked against on systems that do not have the C++ runtime library installed. This means that constructs like `std::vector` must not be used by functions in the @@ -181,7 +188,7 @@ constructs in testing and benchmarking code: for example, Documentation for each user-facing function is written as a Doxygen comment immediately preceding the function's prototype in -`include/armral.h`. Arm RAL uses the Javadoc style, which is a C-style +`include/armral.h`. ArmRAL uses the Javadoc style, which is a C-style multi-line comment that starts with `/**`: ```c @@ -252,7 +259,7 @@ C/C++ code style is maintained through the use of `clang-format` and patch; instructions on how to run these tools are given below. `clang-format` and `clang-tidy` are part of the [LLVM -Project](https://llvm.org/). Arm RAL is tested with version 17.0.0 of +Project](https://llvm.org/). ArmRAL is tested with version 17.0.0 of the tools. Matching your coding style as close as possible to the `clang-tidy` @@ -262,26 +269,28 @@ enforce: - Use snake case for names of variables and functions, i.e. `this_is_a_variable` instead of `thisIsAVariable`. -- Symbol names start with a lower case letter. This means that `_m` - for a member variable, for example, will not be accepted. +- Symbol names start with a lower case letter. This means that `_m` + for a member variable, for example, will not be accepted. -- Always use curly braces for single line `if` statements, `for` loops - and `while` loops. +- Always use curly braces for single line `if` statements, `for` loops + and `while` loops. 
-- Opening curly braces for `if` statements, `for` loops and `while` - loops are on the same line as the `if`, `for` or `while`. +- Opening curly braces for `if` statements, `for` loops and `while` + loops are on the same line as the `if`, `for` or `while`. -- Closing curly braces are the first non-white-space character on a - new line. Their alignment must match the first character of the - matching `if`/`for`/`while` statement. `else` statements are on the - same line as a closing curly brace for the corresponding `if` or `else - if` statement. +- Closing curly braces are the first non-white-space character on a + new line. Their alignment must match the first character of the + matching `if`/`for`/`while` statement. `else` statements are on the + same line as a closing curly brace for the corresponding `if` or `else + if` statement. ### Running clang-format Run `clang-format` on the current commit with: - git clang-format HEAD~ +``` +git clang-format HEAD~ +``` This will correctly format any files modified in the current commit. You must then update your commit with the reformatted files. @@ -292,10 +301,12 @@ Before running `clang-tidy` you must compile the library with an LLVM compiler, i.e. `clang` and `clang++`, and tell CMake to write out the compilation commands by setting `-DCMAKE_EXPORT_COMPILE_COMMANDS=On`: - mkdir - cd - cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DBUILD_TESTING=On - make +``` +mkdir +cd +cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DBUILD_TESTING=On +make +``` Substituting: @@ -306,8 +317,10 @@ Substituting: Then run `clang-tidy` with a list of files to check: - cd - clang-tidy -p ... -header-filter=.* +``` +cd +clang-tidy -p ... -header-filter=.* +``` where `` is the path to a modified file in the library source. Fix any errors and update your commit with the modified files. @@ -317,11 +330,15 @@ source. Fix any errors and update your commit with the modified files. Python code style is maintained through the use of the `flake8` linter. Install `flake8` using `pip`: - pip install flake8 +``` +pip install flake8 +``` and run it on an individual Python file: - python -m flake8 --config=/flake8.txt +``` +python -m flake8 --config=/flake8.txt +``` Where: @@ -333,6 +350,24 @@ This will produce a list of errors, which you must fix manually. Once you have rerun `flake8` and it does not report any errors, add your updated Python file to the current commit. +## CMake code style + +CMake code style is maintained through the use of the `cmake-format` +tool. Install `cmake-format` using `pip`: + +``` +pip install cmake-format +``` + +and run it on an individual `CMakeLists.txt` file: + +``` +cmake-format -i CMakeLists.txt +``` + +This will correctly format the specified file. You must then update +your commit with the reformatted file. + ## Writing tests Each function with a prototype in `armral.h` must be accompanied by a @@ -342,7 +377,7 @@ preferably a separate reimplementation of the function. In some situations it may be necessary to compare against arrays of constant values instead but this should be avoided wherever possible. -Arm RAL tests must exercise every path through the function that leads +ArmRAL tests must exercise every path through the function that leads to a successful exit. Setting the CMake variable `ARMRAL_ENABLE_COVERAGE=On` enables the compiler flags needed to visualize code coverage with [gcovr](https://gcovr.com/en/stable/). 
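To make the expected shape of such a test concrete, the following is a minimal sketch rather than code from the library: `armral_example_scale_f32` is a hypothetical stand-in for a function prototyped in `armral.h`, and the reference is a deliberately naive reimplementation. Standard-library containers are fine here, since the no-C++-runtime rule applies to library code, not to testing and benchmarking code.

```cpp
/* A minimal sketch of an ArmRAL-style test; armral_example_scale_f32
   is a hypothetical stand-in for a library function under test. */
#include <cstdlib>
#include <vector>

// Hypothetical function under test: doubles each element.
static void armral_example_scale_f32(const float *in, float *out, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = 2.0F * in[i];
  }
}

// Naive reference reimplementation to compare against. x + x is
// exactly 2 * x in floating point, so the comparison can be exact.
static void reference_scale_f32(const float *in, float *out, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = in[i] + in[i];
  }
}

int main() {
  constexpr int n = 128;
  std::vector<float> in(n);
  std::vector<float> got(n);
  std::vector<float> want(n);
  for (int i = 0; i < n; ++i) {
    in[i] = 0.5F * static_cast<float>(i);
  }
  armral_example_scale_f32(in.data(), got.data(), n);
  reference_scale_f32(in.data(), want.data(), n);
  for (int i = 0; i < n; ++i) {
    if (got[i] != want[i]) {
      // Failing tests must return EXIT_FAILURE from main().
      return EXIT_FAILURE;
    }
  }
  // Successful tests must return EXIT_SUCCESS from main().
  return EXIT_SUCCESS;
}
```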
@@ -356,8 +391,10 @@ In the top-level `CMakeLists.txt` add an `add_armral_test()` entry pointing to the source file for the tests. The source-code for the test must be placed in a subdirectory of `/test`, where `` is the root directory of the library source. Usually the source for -all the tests of a single Arm RAL function is contained in a single -`main.cpp` file. +all the tests of a single ArmRAL function is contained in a single +`main.cpp` file. The tests should be added to `CMakeLists.txt` in +alphabetical order, grouped by directory, e.g. all UpperPHY +tests are grouped together in alphabetical order. Successful tests must return `EXIT_SUCCESS` from the `main()` function; failing tests must return `EXIT_FAILURE`. @@ -368,7 +405,7 @@ It is recommended to use [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) to test your patches for memory errors as patches will not be accepted unless this passes. Setting the CMake variable `ARMRAL_ENABLE_ASAN=On` -enables the flags needed to compile and link Arm RAL and its tests +enables the flags needed to compile and link ArmRAL and its tests with AddressSanitizer. The `make check` target will then run the tests using AddressSanitizer and will fail if an error is detected. @@ -415,7 +452,7 @@ The following code block provides a template for the `bench.py` script. ```py #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates import json from pathlib import Path @@ -450,20 +487,20 @@ Items in angle brackets `< >` are changed as appropriate according to the following descriptions. - ``: The name of the executable, e.g. -`bench_mu_law_compression_8bit` (see [Naming scheme](#naming-scheme)). + `bench_mu_law_compression_8bit` (see [Naming scheme](#naming-scheme)). - ``: The number of times the case should be run for -profiling (see [Number of repetitions](#number-of-repetitions)). + profiling (see [Number of repetitions](#number-of-repetitions)). - ``: The arguments that will be required in order -to run the function that is to be benchmarked. This can be a list of individual -elements, or can, for example, be a list of tuples if multiple arguments are -required for each case. The length of the list determines how many cases are -generated. See [Number of cases](#number-of-cases) for guidance on how many -cases there should be. + to run the function that is to be benchmarked. This can be a list of individual + elements, or can, for example, be a list of tuples if multiple arguments are + required for each case. The length of the list determines how many cases are + generated. See [Number of cases](#number-of-cases) for guidance on how many + cases there should be. - ``: A snake case string to identify the function being -benchmarked for a particular case, e.g. `mu_law_compression_8bit`. + benchmarked for a particular case, e.g. `mu_law_compression_8bit`. - ``: The arguments in the argument list. @@ -481,7 +518,7 @@ The following code block provides a basic template. ```cpp /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -535,40 +572,40 @@ The items in angle brackets `< >` are changed as appropriate according to the following descriptions. - ``: The name of the function that repeatedly calls the -function being benchmarked, e.g.
`run_mu_law_compression_8bit_perf` (see -[Naming scheme](#naming-scheme)). + function being benchmarked, e.g. `run_mu_law_compression_8bit_perf` (see + [Naming scheme](#naming-scheme)). - ``, ``: The types of the arguments which are passed -in on the command line. + in on the command line. - ``: An uppercase string to identify the function, e.g. -`"MU LAW COMPRESSION 8BIT"`. + `"MU LAW COMPRESSION 8BIT"`. - ``, ``: Descriptions to identify the -arguments when printing. + arguments when printing. - ``, ``: The format specifiers for printing -the arguments. + the arguments. - ``, ``: The types of the variables defined locally -in ``. + in ``. - ``, ``: The names of variables defined locally in -``. + ``. - ``: The name of the library function being benchmarked (e.g. -`armral_mu_law_compr_8bit`). + `armral_mu_law_compr_8bit`). - ``: The number of arguments which are passed to the executable -on the command line. This is equal to the number of arguments in the `args` -field of the JSON object + 1 (since the filename is the first argument). + on the command line. This is equal to the number of arguments in the `args` + field of the JSON object + 1 (since the filename is the first argument). - ``, ``: The names of the arguments which are passed to the -executable on the command line. These are the names of the arguments provided -in the `args` field of the JSON object generated by `bench.py`. + executable on the command line. These are the names of the arguments provided + in the `args` field of the JSON object generated by `bench.py`. - ``, ``: A description of each -command line argument. + command line argument. ##### Outputs @@ -587,7 +624,9 @@ Once the new `main.cpp` file has been created, an entry must be added to where `` is the `exe_name` without `bench_` at the front (e.g. `mu_law_compression_8bit`). The entry goes with the other benchmark -entries as part of the `if(BUILD_TESTING)` logic. +entries as part of the `if(BUILD_TESTING)` logic. The benchmarks should be +added to `CMakeLists.txt` in alphabetical order, grouped by directory, +e.g. all UpperPHY benchmarks are grouped together in alphabetical order. #### Directory structure @@ -595,7 +634,7 @@ Benchmarks for different functions should be separated into different files. For example, for Mu Law compression and decompression there are different functions for 8-bit, 9-bit and 14-bit (de)compression. These should be in separate benchmarking executables. The Mu Law directory -structure in `bench` therefore looks like: +structure in `bench/DuRuInterface` therefore looks like: ``` +-- MuLaw diff --git a/CREDITS.md b/CREDITS.md index 467a1b112b05d8f43c911b32c9338d75d6402d1f..0271d77b8ea30bc4da7ed29edf1cb040d2e08f64 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -5,33 +5,31 @@ Acceleration Library: - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/6. + <https://gitlab.arm.com/networking/ral/-/merge_requests/6>. - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support the addition and removal of filler bits when the soft buffer size is less than the full buffer size was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/5. + <https://gitlab.arm.com/networking/ral/-/merge_requests/5>.
- Work on `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support the addition and removal of filler bits when the code block size is not a multiple of lifting set size was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/4 + <https://gitlab.arm.com/networking/ral/-/merge_requests/4>. - Work on `armral_seq_generator` to extend the `sequence_len` parameter to `uint32_t` was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/3 + <https://gitlab.arm.com/networking/ral/-/merge_requests/3>. - Work on `armral_polar_rate_matching` and `armral_polar_rate_recovery` to enable or disable bit interleaving was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/2 + <https://gitlab.arm.com/networking/ral/-/merge_requests/2>. - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support soft buffer sizes was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/1 - - + <https://gitlab.arm.com/networking/ral/-/merge_requests/1>. diff --git a/Doxyfile.in b/Doxyfile.in index c470dc932f4e2c8e67c67dadad76e8e432d00a32..f571d32cda510d0f33a63e15719233f6e1710031 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.01" +PROJECT_NUMBER = "24.04" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/LICENSE.md b/LICENSE.md index 10ce6d47a47a0a67137b55ce616bd64256f4856d..e511299cc09fd0ca2dc106e40a455371dabe087a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright 2020-2024 Arm Limited and/or its affiliates +SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index bcaea5e108a2fc3409f09a8397663cfececc4f51..83a9a05d25e77989591674760d67fba93aaa2063 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,25 @@ # Get started with Arm RAN Acceleration Library (ArmRAL) -Describes how to build, install, run tests and benchmarks, and uninstall Arm RAN -Acceleration Library (ArmRAL). +This document describes how to build, install, run tests and +benchmarks, and uninstall Arm RAN Acceleration Library (ArmRAL). -# Before you begin +## Introducing Arm RAN Acceleration Library -If you have not already downloaded Arm RAN Acceleration library, visit -https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download -to download the source code. +Arm RAN Acceleration Library provides optimized signal processing and related +maths functions for enabling 5G Radio Access Network (RAN) deployments. It +leverages the efficient vector units available on Arm cores that support the +Armv8-a architecture to accelerate 5G NR and LTE signal processing workloads, +including: + +* Matrix and vector arithmetic, such as matrix multiplication. +* Fast Fourier Transforms (FFTs). +* Digital modulation and demodulation. +* Cyclic Redundancy Check (CRC). +* Encoding and decoding schemes, including Polar, Low-Density Parity + Check (LDPC), and Turbo. +* Compression and decompression. + +## Before you begin * Ensure you have installed all the tools listed in the **Tools** section of the `RELEASE_NOTES.md` file. @@ -16,7 +28,7 @@ to download the source code.
the PMULL extension, pmull is listed under the **Features** list given in the `/proc/cpuinfo` file. -# Build Arm RAN Acceleration Library (ArmRAL) +## Build Arm RAN Acceleration Library (ArmRAL) 1. Configure your environment. If you have multiple compilers installed on your machine, you can set the `CC` and `CXX` environment variables to the path to @@ -56,14 +68,14 @@ to download the source code. Notes: - * The `-DBUILD_TESTING=On` and `-DBUILD_EXAMPLES=On` options are optional, - but are required if you want to run the library tests (`-DBUILD_TESTING`) - and benchmarks (`-DBUILD_EXAMPLES`). + * The `-DBUILD_TESTING=On` and `-DBUILD_EXAMPLES=On` options are required + if you want to run the library tests (`-DBUILD_TESTING`) and benchmarks + (`-DBUILD_EXAMPLES`). - * The `-DCMAKE_INSTALL_PREFIX=` option is optional and - specifies the base directory used to install the library. The library - archive is installed to `/lib` and headers are installed to - `/include`. The default location is `/usr/local`. + * The `-DCMAKE_INSTALL_PREFIX=` option specifies the base + directory used to install the library. The library archive is installed to + `/lib` and headers are installed to `/include`. + The default location is `/usr/local`. * By default, a static library is built. To build a dynamic or a static library use the `-DBUILD_SHARED_LIBS={On|Off}` option. @@ -214,7 +226,7 @@ to download the source code. Default is `On`. -# Install Arm RAN Acceleration Library (ArmRAL) +## Install Arm RAN Acceleration Library (ArmRAL) After you have built Arm RAN Acceleration Library, you can install the library. @@ -235,7 +247,7 @@ After you have built Arm RAN Acceleration Library, you can install the library. directory. `install_manifest.txt` lists the installation locations for the library and the header files. -# Run the tests +## Run the tests The Arm RAN Acceleration Library package includes tests for the available functions in the library. @@ -247,9 +259,6 @@ To build and run the tests, use: make check -The tests run and test the available functions in the library. Testing -times vary from system to system, but typically only take a few seconds. - If you are not developing on an AArch64 machine, or if you want to test the SVE or SVE2 version of the library on an AArch64 machine that does not support the extension, you can use the `-DARMRAL_TEST_RUNNER` option to prefix each test @@ -260,7 +269,7 @@ prefix the tests with `qemu-aarch64` using: cmake .. -DBUILD_TESTING=On -DARMRAL_TEST_RUNNER=qemu-aarch64 make check -# Run the benchmarks +## Run the benchmarks All the functions in Arm RAN Acceleration Library contain benchmarking code that contains preset problem sizes. @@ -274,10 +283,21 @@ To build and run the benchmarks, use: make bench -Benchmark results print as JSON objects. To further process the results, you -can collect the results to a file or pipe the results into other scripts. +Benchmark results print as JSON objects. To further process the results, you can +collect the results to a file or pipe the results into other scripts. +Alternatively, the Makefile target: + + make bench_excel_summary + +will run the benchmarks and produce an Excel spreadsheet of the results, in +addition to printing them as JSON objects. To install the required Python +packages for this target, use: + + pip install -r /python/requirements.txt + +where `` is the path to the root directory of the library source. 
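As an illustration of piping the results into another script, the following sketch (not shipped with the library) collects the piped output into a single JSON file. It assumes each benchmark result is printed as one JSON object per line; any other output lines are skipped.

```py
#!/usr/bin/env python3
# Sketch: collect benchmark results piped from `make bench` into a
# single JSON file. Assumes one JSON object per line of output.
import json
import sys

results = []
for line in sys.stdin:
    line = line.strip()
    if not line.startswith("{"):
        continue  # skip build output interleaved with the results
    try:
        results.append(json.loads(line))
    except json.JSONDecodeError:
        pass  # not a complete JSON result line after all
with open("bench_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"collected {len(results)} benchmark results")
```

For example: `make bench | python3 collect_results.py`, where `collect_results.py` is wherever you saved the sketch.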
-# Run the examples +## Run the examples The source for the example programs is available in the `examples` directory, found in the ArmRAL root directory. @@ -301,7 +321,7 @@ More information about the examples that are available in Arm RAN Acceleration Library, and how to use the library in general, is available in **Use Arm RAN Acceleration Library (ArmRAL)** (see `examples.md`). -# Run the simulations +## Run the simulations You can evaluate the quality of the error correction of the different encoding schemes against the signal-to-noise ratio using a set of noisy channel simulation @@ -331,11 +351,11 @@ directory. More information about the simulation programs that are available in Arm RAN Acceleration Library is available in `simulation/README.md`. -# Code coverage +## Code coverage You can generate information that describes how much of the library is used by your application, or is covered by the included tests. To collect code coverage -information, you must have built Arm RAN Acceleration Library with +information, you must have built Arm RAN Acceleration Library with `-DARMRAL_ENABLE_COVERAGE=On`. An example workflow could be: @@ -361,11 +381,11 @@ update to a newer version of `gcovr`. To find out what versions of `gcovr` have been tested with ArmRAL, see the **Tools** section of the `RELEASE_NOTES.md` file. -# Documentation +## Documentation The Arm RAN Acceleration Library Reference Guide is available online at: - https://developer.arm.com/documentation/102249/2401 + https://developer.arm.com/documentation/102249/2404 If you have Doxygen installed on your system, you can build a local HTML version of the Arm RAN Acceleration Library documentation using CMake. @@ -377,7 +397,7 @@ To build the documentation, run: The HTML builds and is output to `docs/html/`. To view the documentation, open the `index.html` file in a browser. -# Uninstall Arm RAN Acceleration Library +## Uninstall Arm RAN Acceleration Library To uninstall Arm RAN Acceleration Library: diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index eee0c452959f142c143bca707b0a9ee720240b92..7830b73a84ff23a43e0d50416a059e033a146992 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,4 +1,4 @@ -# Arm RAN Acceleration Library 24.01 Release Note +# Arm RAN Acceleration Library 24.04 Release Note Non-Confidential Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -7,7 +7,7 @@ Arm conventions and proprietary notices, including confidentiality status, terminology statement, and product release status, can be found at the end of this document. -# Contents +## Contents - Release overview - Release contents @@ -16,38 +16,32 @@ this document. - Conventions - Proprietary notices -# Release overview +## Release overview The following sections describe the product that this release note describes and its quality status at time of release. -Use of Arm RAN Acceleration Library is subject to a BSD-3-Clause license, the -text of which can be found in the `LICENSE.md` file in your product -installation. We will receive inbound contributions under the same license. - -## Product description +### Product description The Arm RAN Acceleration Library (ArmRAL) contains a set of functions for accelerating telecommunications applications such as, but not limited to, 5G -Radio Access Networks (RANs). - -The Arm RAN Acceleration Library 24.01 package provides a library that is -optimized for Arm AArch64-based processors. +Radio Access Networks (RANs). These functions are optimized for Arm AArch64-based +processors. 
Arm RAN Acceleration Library provides: - Vector functions - Matrix functions -- Lower PHY support functions -- Upper PHY support functions -- DU-RU Interface support functions +- Lower physical layer (Lower PHY) support functions +- Upper physical layer (Upper PHY) support functions +- Distributed Unit-Radio Unit (DU-RU) Interface support functions Arm RAN Acceleration Library includes functions that operate on 16-bit signed -integers and 32-bit floating-point values. +integers and 16-bit and 32-bit floating-point values. -## Release Status +### Release status -This is the 24.01 release of Arm RAN Acceleration Library. +This is the 24.04 release of Arm RAN Acceleration Library. These deliverables are being released under the terms of the agreement between Arm and each licensee (the "Agreement"). All planned verification and @@ -55,7 +49,7 @@ validation is complete. The release is suitable for volume production under the terms of the Agreement. -## Licensing information +### Licensing information Use of Arm RAN Acceleration Library is subject to a BSD-3-Clause license, the text of which can be found in the `LICENSE.md` file in your product @@ -64,56 +58,35 @@ installation. We will receive inbound contributions under the same license. If you require a different license than BSD-3-Clause for compatibility with your end product, please get in contact. -# Release contents +## Release contents Arm RAN Acceleration Library releases contain documentation and source files. The following subsections describe: -- Downloading and unpacking the product. +- Cloning the product's git repository from Arm's Gitlab. - The contents of this release. - Any changes since the previous release. - Any known issues and limitations that exist at the time of this release. -## Downloading and unpacking - -You can either clone the source as a git repository from Arm's Gitlab, -or you can download Arm RAN Acceleration Library as a tarball of -source from the Arm Developer website and then unpack the contents. - -**To clone the Arm RAN Acceleration Library repository via SSH:** - - git clone git@git.gitlab.arm.com:networking/ral.git - -**To clone the Arm RAN Acceleration Library repository via HTTPS:** - - git clone https://git.gitlab.arm.com/networking/ral.git +### Cloning the source repository -**To download the tarball and unpack the contents:** +**To obtain the 24.04 release of Arm RAN Acceleration Library by cloning + the repository via HTTPS:** -1. Go to https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download. + git clone -b armral-24.04 https://git.gitlab.arm.com/networking/ral -2. Complete the form and click **Submit**. The package downloads. - -3. Locate the downloaded .tar.gz file. - -4. Copy the .tar.gz file to the directory where these files are to be built. - -5. Extract the tar file contents using a tar utility: - - tar zxvf ral-armral-24.01.tar.gz -## Deliverables +### Deliverables The downloaded product includes the deliverables listed in this section. - Arm RAN Acceleration Library 24.01 +- Arm RAN Acceleration Library 24.04 - Release Notes (this document) - Documentation Product documentation is available on the Arm Developer website at: - https://developer.arm.com/documentation/102249/2401 + <https://developer.arm.com/documentation/102249/2404> **Note:** Documentation, errata and release notes might change between product releases. For the latest documentation bundle, check the product download @@ -122,77 +95,67 @@ The downloaded product includes the deliverables listed in this section.
**Note:** Arm tests its PDFs only in Adobe Acrobat and Acrobat Reader. Arm cannot guarantee the quality of this document when used with any other PDF reader. A suitable PDF reader can be downloaded from Adobe at - http://www.adobe.com. + <http://www.adobe.com>. -## Differences from previous release +### Differences from previous release The following subsections describe differences from the previous release of Arm RAN Acceleration Library. -### Additions and functionality changes +#### Additions and functionality changes Describes new features or any technical changes to features or components in this release. -- Added support for the addition and removal of filler bits in - `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and - `armral_ldpc_rate_recovery` when the code block size is not a - multiple of lifting set size or when the soft buffer size is less - than the full buffer size. This process is described in the 3GPP - Technical Specification (TS) 38.212. This work was contributed - upstream by 4g5g Consultants. - - Extended `armral_cmplx_pseudo_inverse_direct_f32` and `armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the - regularized pseudo-inverse of a single complex 32-bit matrix of size - `M-by-N` for cases where `M > N` in addition to the cases where `M - <= N`. + regularized pseudo-inverse of a complex 32-bit matrix of size + `M-by-N` for the case where `M` and/or `N` == 1. + +- Added a Makefile target `bench_excel_summary` to run the benchmarks + and create an Excel spreadsheet containing the results. -### Performance improvements +#### Performance improvements Describes any features or components whose performance has improved in the current release compared with the previous release. -- Performance improvements for the following routines: - - * `armral_turbo_decode_block` and `armral_turbo_decode_block_noalloc`. - - Performance improvements for SVE2 implementations of the following routines: - * `armral_seq_generator`, for the cases when `sequence_len` is not a - multiple of 64. + - `armral_turbo_decode_block` and + `armral_turbo_decode_block_noalloc`. These functions now operate + internally on 16-bit floating point values rather than 32-bit + floating point values. + + - `armral_ldpc_encode_block` and + `armral_ldpc_encode_block_noalloc`. -### Changes to simulation programs +#### Changes to simulation programs Describes any changes, new features or components added to the channel simulation programs in this release. -- Added support for the addition and removal of filler bits in - `ldpc_awgn` when the code block size is not a multiple of lifting - set size. This work was contributed upstream by 4g5g Consultants. +- There are no changes to the channel simulation programs in this + release. -### Resolved issues +#### Resolved issues Describes any known issues resolved in the current release. -- LDPC block encoding (`armral_ldpc_encode_block`), rate matching - (`armral_ldpc_rate_matching`) and rate recovery - (`armral_ldpc_rate_recovery`) now support the insertion and removal - of filler bits as described in the 3GPP Technical Specification (TS) - 38.212. +- There are no known issues resolved in this release. -## Known limitations +### Known limitations Describes any known limitations of the current release. - There are no known limitations in this release.
-# Support +## Support -If you have any issues with the installation, content or use of this release, -raise a question on the Developer Community Forum: +If you have any issues with the installation, content, or use of this +release, raise a question on the Developer Community Forum: - https://community.arm.com/developer/f/infrastructure-solutions + <https://community.arm.com/developer/f/infrastructure-solutions> Arm will respond as soon as possible. @@ -203,37 +166,35 @@ A Full release of the Arm Deliverable shall have met the contractual requirement for verification and validation of the deliverable subject to any waivers agreed between Arm and the Customer. -## Tools +### Tools -The following points list the tools that are required to build or run Arm RAN -Acceleration Library: +To build or run Arm RAN Acceleration Library, you will need: -* A recent version of a C/C++ compiler, such as GCC. Arm RAN - Acceleration Library has been tested with GCC 7.5.0, 8.5.0, 9.5.0, - 10.5.0, 11.4.0, 12.3.0, and 13.2.0. +- A C/C++ compiler, such as GCC. Arm RAN Acceleration Library has been tested + with GCC 7.5.0, 8.5.0, 9.5.0, 10.5.0, 11.4.0, 12.3.0, and 13.2.0. **Note:** If you are cross-compiling, you need a cross-toolchain compiler that targets AArch64. You can download open-source cross-toolchain builds of the GCC compiler on the Arm Developer website: - https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads + <https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads> The variant to use for an AArch64 GNU/Linux target is `aarch64-none-linux-gnu`. -* A recent version of CMake (version 3.3.0, or higher). +- CMake version 3.3.0 or higher. -In addition to the preceding requirements: +Additionally: -* To run the benchmarks, you must have the Linux utility tool `perf` installed +- To run the benchmarks, you must have the Linux utility tool `perf` installed and a recent version of Python 3. Arm RAN Acceleration Library has been tested with Python 3.8.5. -* To build a local version of the documentation, you must have Doxygen +- To build a local version of the documentation, you must have Doxygen installed. Arm RAN Acceleration Library has been tested with Doxygen version 1.8.13. -* To generate code coverage HTML pages, you must have `gcovr` installed. The +- To generate code coverage HTML pages, you must have `gcovr` installed. The library has been tested with `gcovr` version 4.2. **Note:** Arm RAN Acceleration Library runs on AArch64 cores, however @@ -242,27 +203,27 @@ functions you must run on a core that supports the AArch64 PMULL extension. If your machine supports the PMULL extension, `pmull` is listed under the "Features" list given in the `/proc/cpuinfo` file. -# Release history +## Release history A full release history (with release notes) for Arm RAN Acceleration Library is available on the Arm Developer website: - https://developer.arm.com/downloads/-/arm-ran-acceleration-library/previous-releases-of-the-arm-ran-acceleration-library + <https://developer.arm.com/downloads/-/arm-ran-acceleration-library/previous-releases-of-the-arm-ran-acceleration-library> -# Conventions +## Conventions The following subsections describe conventions used in Arm documents. -## Glossary +### Glossary The Arm Glossary is a list of terms that are used in Arm documentation, together with definitions for those terms. The Arm Glossary does not contain terms that are industry standard unless the Arm meaning differs from the generally accepted meaning. -See the Arm Glossary for more information: https://developer.arm.com/glossary. +See the Arm Glossary for more information: <https://developer.arm.com/glossary>.
-# Non-Confidential Proprietary Notice +## Non-Confidential Proprietary Notice This document is protected by copyright and other related rights and the practice or implementation of the information contained in this document may be @@ -310,7 +271,7 @@ The Arm corporate logo and words marked with ® or ™ are registered trademarks trademarks of Arm Limited (or its affiliates) in the US and/or elsewhere. All rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm’s trademark usage -guidelines at https://www.arm.com/company/policies/trademarks. +guidelines at <https://www.arm.com/company/policies/trademarks>. Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -318,7 +279,7 @@ Arm Limited. Company 02557590 registered in England. 110 Fulbourn Road, Cambridge, England CB1 9NJ. (LES-PRE-20349) -## Confidentiality Status +### Confidentiality Status This document is Non-Confidential. The right to use, copy and disclose this document may be subject to license restrictions in accordance with the terms of @@ -327,15 +288,15 @@ to. Unrestricted Access is an Arm internal classification. -## Product Status +### Product Status The information in this document is Final, that is for a developed product. -## Web Address +### Web Address -https://developer.arm.com + <https://developer.arm.com> -## Inclusive language commitment +### Inclusive language commitment Arm values inclusive communities. Arm recognizes that we and our industry have used language that can be offensive. Arm strives to lead the industry and create diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py index 74414bb4b8f58e48287a6755259912394a5236ef..1770d5a640093f4d66f7802517dfdf9c2c2b3aaa 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp index 32847b488d83f24e9cb6a129565f2a6c7323e278..8375897a86249ac88a9808fe3c4d00601c57e38e 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py index aeaf28cae123b91b35480dfeac2a467a8f04ca07..51d5ad798490dda222bc2acd764ea3016e757753 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py @@ -1,6
+1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp index 0f27c83735e74eb14de3ff5ba090f9879d937d67..64a2b4f0a4f5ade087e734efc35dae1e2d9c7af5 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py index 8710c18caa683a98e31d02d18b8c874559468d00..2cb80568cc31ca4621b8a4f54e660b4f15453a56 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp similarity index 93% rename from bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp index 8c66a87be3737bbaf2d176fb66b5f5668d5c969c..8d0e972ca269e6e608babf68b96a8cf48b0a4cb8 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py index eaf5b9baad7575f2aadcf34fb194099582784fb1..426c4940caa413317e69e5cb8ce4bd7eafa4ee0e 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp index 
6536ea8615c61e3998787206b4c082e5a27bdd9e..cd8a7a6575bcde04b8ce158bbaa78e2898a80457 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Single/GeneralMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py similarity index 82% rename from bench/MatrixInv/Single/GeneralMatInv/bench.py rename to bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py index 3903a1ba46d75ba8586c4600f1dfca95c4be3f17..369ee50ac1d9f8ad9a2a641c51eb923b9502e9d3 100755 --- a/bench/MatrixInv/Single/GeneralMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/GeneralMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp similarity index 90% rename from bench/MatrixInv/Single/GeneralMatInv/main.cpp rename to bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp index 94e63e03c02f6244ea82601207f5b1c75d639728..2509ad536f06b799e0283d37161e701675d8f4b4 100644 --- a/bench/MatrixInv/Single/GeneralMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Single/HermitianMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py similarity index 82% rename from bench/MatrixInv/Single/HermitianMatInv/bench.py rename to bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py index f0d7e5b984adc201e291fb64f6bbd234d16c0e5d..af79e2eab00a6cb7ae085fd5188f12ab8758dd97 100755 --- a/bench/MatrixInv/Single/HermitianMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/HermitianMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp similarity index 91% rename from bench/MatrixInv/Single/HermitianMatInv/main.cpp rename to bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp index c9b708fb10089a1b9983b7e490bb6b82baea9786..de9d111e76de6aa7c975dca2a0623e50646b2f43 100644 --- a/bench/MatrixInv/Single/HermitianMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/1x2/bench.py 
rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py index e7edca6043f31b7b778683ed6af9e3ba066f4d82..5fdd983a12fb6ef178cf34a663142a8371c526cb 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp index 9b0453fdd4972cbedbcd4b106839b303abc3c73e..96216c64d5974f469062e7cc489eeb0b1614367d 100644 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/1x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py index d190973878d057c6fada1a644a7ce1f647f5f356..51e0638c6017ea44bc1954690963c2c7d29d4cee 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp index 1b8bd8bb43cd42a643d257714c6309d466bf1704..6c83b1aca123483c36f931cb60b0c2e1641cbdde 100644 --- a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/2x2/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py index aee26677a907174467751aa07697427b1ed8cdc8..4d8f7fbc081addb2370bf9b2038a6475b817f057 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp rename to 
bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp index b554c1f6cc9afac83face5d645b1256e914d9161..2fd1c770b758874b10cc71b83c391b7840864490 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/2x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py index 9dd2030a9fb5f54dff4f33335a6316bc1bf03440..6c0e4a84ff1ef519627f04d9b7adc234dd891954 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp index 7637055133e6d0609930518a556241af3d7f7d4b..24ab935cf63f79c428a1cc49ad13ce0667aac432 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/4x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py index ee929f25253f059b3c33c633b22713f766aa2b4a..1a73cc6a5fc870e58ec9157d3e80320c66a90872 100755 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp index 4082649fe53e702c312ff24587bfc7f06eba1859..20522897e17c983e11d3f761863b88d5b8d38b6e 100644 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py rename to 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py index 8549eab3111fd62bb699d0265d1adf297903f4e2..ae47493aa3905932b7db05280d38fd3a6ac1e581 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp index 6d0006fb206cc3fe7e6e31efce17a0d0664a4757..f29d40bd5645aff4b01cd4f6fc0d10d2afb12821 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py index 4ecaa289625f6c62357924f1fc31400272347936..726a7682447fd3abf0fd5c59f9b2ed69d7eb3ecf 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp index 85b3f96e50d6f9f4976af81ef813d0c7fa9b9df4..19d7cedcfa76bb73303cc48685b7e8a703bed481 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py index 621a7a03f1548fff14dc46b0c6abc55af209b26b..d42117e8d35f6606b6d3ce12cfea8914e8323acb 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm 
Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp index 63034a6edeeeccfd892cbf5ec1c42bedeaf6de8e..0c987fc41f6cf9c28f0b2e08ea06dd22fcf5a6f5 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py index 5e88789dc2c415900cff9638b1e80a7535b793e9..d1dee616ea9fbc9575bc6d89aba5722c5efdfc70 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp index bdfbd196a6c8fe1d72e1a309477673510fef7f68..1df10870111a00429c0b9ef07b11182b692b2e99 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py index 0cfde51964b54252cd73f43e374bc7abd2c771f9..ca14a8682a2f0b06616001c806ee3742da8af545 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp rename to 
bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp index 3344fe6f1d20d43a2168e1e09528081bf898b894..3ea7a1cb4ab402e73fd5815c71b9a5c829f9940f 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py index 78f0bae758c3a12d07f56e3cabf51ca8c6631503..f10ae624629acb527344f1dc1bd2fbacfcc7031e 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp index 0aa6934ccc53fadaa2f415b14a0d76bbf64a015b..dee1b72b8703bf0e402975fc6e66c1177bd8c31c 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py similarity index 82% rename from bench/MatrixMult/Single/MatrixMult16/32b/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py index 5e4312e3e1c5a32b28516a7628aa1a66376a89b8..94584d545b7c6fda386a3368de03b701883788b0 100755 --- a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMult16/32b/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp index 3462e61a4eb4882dd96ce3153ecde1f3b1c5919a..68baf3a792c7d399b113449f27ca68732d00b20b 100644 --- a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py 
b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py similarity index 82% rename from bench/MatrixMult/Single/MatrixMult16/64b/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py index ef4ad39039a7e0ff913ba0b51cf9599b98987978..0be0f7a2d96272a12e0062358bd0e7f05a986093 100755 --- a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp similarity index 90% rename from bench/MatrixMult/Single/MatrixMult16/64b/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp index a1ebda8082cdd30c7807909ca0a1cbe5bc76a5c7..098fc2680d9a924874b8e1f0c30d9740b43211df 100644 --- a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py index 6125bad8f7a505bbc0ba11a95240f6155fa4ac46..1ee075561c90cac508128595f631b06611ae17b8 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp index 890a174cafc7d40af027669750de6aab22b6982e..d3ceec3570956dd11b009beb58e20cc9a379b437 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py index f58fc63622325c39723f06b4fe925ac62bc80947..3cb7d5d19a1f3fe20e3705e8a4c0e9635106e109 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited 
and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp similarity index 89% rename from bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp index 74ab2b7a34a11ab9b234c9efaa2976651ac39ab2..060fe7e460b9ad34ade6ab607f43fbaaa0eb8952 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py index c7dd1f99e72633660a44f196b6790f052e40cdfc..e79186feef95840bd3a3b08c18a5266bd60cfc8e 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp index 3ccd0ceac7851cd82c62e8b136132ec1dccc29c9..abf84cdac525fac28303db6412578d092cf9ebe5 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py index 626b61804265975d5350f0ae0f9b35b85c7f09af..9c51504424a879e488a88a5be685fa505f1eb299 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp similarity index 89% rename from bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp index 
d0eb8697ba70904b6d58f570ec845097ce1eb164..a73a0742ad0ba161f53e02f07f4cb42274bf16e6 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/general/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py similarity index 87% rename from bench/MatrixMult/Single/MatrixMult32/general/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py index cae82ac76a655e1583f063665e260cfcdd154a51..f0c6ae0d2aaad8d3aff2f8f3c42b98cb6767963f 100755 --- a/bench/MatrixMult/Single/MatrixMult32/general/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp similarity index 93% rename from bench/MatrixMult/Single/MatrixMult32/general/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp index f37000d52e0bdf7be1a9f5a004ef2c376d8eb1f2..a2f6657f64a35f59f6e0e696c52c062bbdaad56f 100644 --- a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py similarity index 85% rename from bench/MatrixMult/Single/MatrixMultAAH32/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py index c911c26f9c15597c660fa2a1c85b5af8cd349cb5..1dca6b0446f4c44d0f08839166a94ab87f092ea1 100755 --- a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMultAAH32/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 5ebdcf156611b84d3eba5c7184b091a0d0080443..949c8791a8c9d9d056e418ab3183c5196df42c22 100644 --- a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py similarity index 91% rename from 
bench/MatrixMult/Single/MatrixMultAHB32/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py index 9a58a3d056aca8e7ffffd538404db30630937225..71916e5cd51162f9c2387769dab4890025fccada 100755 --- a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixMultAHB32/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index 1fee8780fd2d2501b98e1eda2e4f2b0eaa36689e..873a3e316c90990c058f195495063371ccbf92ce 100644 --- a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py similarity index 84% rename from bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py index c15b3abf81f8669b4a7387f01e528ffcf857f052..d2d7e5834af879e2b4f50b7e864a2cafb4ca7c17 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp index 63e49a220167a9eb74df771157fd1a77c46a1fce..3d21c2371c63d23055b7c1ef0ac607fe3ccf1dd2 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py similarity index 84% rename from bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py index 4ada38d39c87395814f706565351bc3a4b295685..be7d58d22c71acdba031145bd387284b55a63367 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates

 import json
 import itertools
diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp
similarity index 92%
rename from bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp
rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp
index 8c0c7b1eb45b3a5264af0942b422a51aa22f4637..5481fac80d367a7365ff771482ba20c3de4c70e6 100644
--- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp
+++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "armral.h"
diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py
similarity index 87%
rename from bench/MatrixMult/Single/MatrixVectorMult32/bench.py
rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py
index fb5e76256f9c1bf827a923fa3170041ea2a0780b..3a6ee40797a4d5e4607e91a48294cda602c3f927 100755
--- a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py
+++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2020-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates

 import json
 import itertools
diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp
similarity index 89%
rename from bench/MatrixMult/Single/MatrixVectorMult32/main.cpp
rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp
index 9cc90ba5fedefe818bcb6f8734a85d6d398fdcf3..07a22a33449230fd0ee2e95df390605e8719bee2 100644
--- a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp
+++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "cf32_utils.hpp"
diff --git a/bench/MatrixPseudoInv/Direct/bench.py b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py
similarity index 81%
rename from bench/MatrixPseudoInv/Direct/bench.py
rename to bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py
index fcfb46273a48787b92cdf2ab8c2eef65ab79b655..56c99b43f65664a2cd6b2d3e750a80955e872c13 100755
--- a/bench/MatrixPseudoInv/Direct/bench.py
+++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2023-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates

 import json
 import itertools
@@ -20,7 +20,7 @@ j = {
     "cases": []
 }

-size1 = [2, 3, 4, 8, 16]
+size1 = [1, 2, 3, 4, 8, 16]
 size2 = [32, 64, 128, 256]

 for (m, n) in itertools.chain(zip(size1, size2), zip(size2, size1)):
diff --git a/bench/MatrixPseudoInv/Direct/main.cpp b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp
similarity index 94%
rename from bench/MatrixPseudoInv/Direct/main.cpp
rename to bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp
index 6339c24a29c508cdb819a9f5012b300231d4939b..1e559691ec5909f78a836e0cc9679734b2482c08 100644
--- a/bench/MatrixPseudoInv/Direct/main.cpp
+++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py similarity index 82% rename from bench/VectorDotProd/VecDot16/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16/bench.py index 4c4bacd3b4d566bc4676ee8f30120b233e025938..a333391a9ba8377502f36c9f2b3c5ee2e9a1c780 100755 --- a/bench/VectorDotProd/VecDot16/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot16/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp index 4d2179f3da195fbbe969917ab5544c4db7469ffd..d1542064dd78fc9681437dcd872194b5c5491023 100644 --- a/bench/VectorDotProd/VecDot16/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_2/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py index 18d099c07f3f56e28e568c4c526774f38d4c1c59..e0c3df7b225aae0bc723ba74dfb0f5b12441274b 100755 --- a/bench/VectorDotProd/VecDot16_2/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp similarity index 92% rename from bench/VectorDotProd/VecDot16_2/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index 356bcfbf7e5d2e6cd1f08c4ad787e7d00edd86c1..b1e24eab16f3a61caba36d5ba8a7795a97f678cc 100644 --- a/bench/VectorDotProd/VecDot16_2/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_2_32bit/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py index 3f1b23afe529ead4f50038ae997537cf43a115d4..2be5f3189b145c6cef29a6bc414f271b5ac12c4c 100755 --- a/bench/VectorDotProd/VecDot16_2_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp similarity index 92% rename from bench/VectorDotProd/VecDot16_2_32bit/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index 2e6377e7b269a68156a47713a2ab8fa900e7b6a9..bc51b6aaeb27a72b427534593cc0ac1819ec12bc 100644 --- a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_32bit/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py index 2dd7bdd9f72699c86943da383ed055ee524087c8..70c0455fe5b2e6bc39fb80b172fd1b97a48ab66a 100755 --- a/bench/VectorDotProd/VecDot16_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot16_32bit/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index 0a0f27dbfa654131fe811fd0f96b902498b5a52d..618feebd476ae82c9d36bd4e78899ae6c098db20 100644 --- a/bench/VectorDotProd/VecDot16_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot32/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py similarity index 82% rename from bench/VectorDotProd/VecDot32/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot32/bench.py index 13764c41721512188cac1ee623fa35cb5604b714..37a8b266183fd01b80d5abc98f91c2bc5349bcc6 100755 --- a/bench/VectorDotProd/VecDot32/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot32/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot32/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp index 5ecf2c1b2d68cb6be0930c9368d7edbd94dd925a..c2aee11db3312bad9b52cb07563d1558ad755d02 100644 --- a/bench/VectorDotProd/VecDot32/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git 
a/bench/VectorDotProd/VecDot32_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py
similarity index 82%
rename from bench/VectorDotProd/VecDot32_2/bench.py
rename to bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py
index c249222f5e4f3ef05855ea796e002ebc95ecace1..0a4b022dc0ada7794a216fec98014a6df4e06981 100755
--- a/bench/VectorDotProd/VecDot32_2/bench.py
+++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2020-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates

 import json
 from pathlib import Path
diff --git a/bench/VectorDotProd/VecDot32_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp
similarity index 75%
rename from bench/VectorDotProd/VecDot32_2/main.cpp
rename to bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp
index 0365c3084fc443885af64516082bda7ccead6313..a379d9e6a9a79f66d64d7e337466f75572fc99e8 100644
--- a/bench/VectorDotProd/VecDot32_2/main.cpp
+++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "armral.h"
@@ -14,12 +14,12 @@ void run_vec_dot_f32_2_perf(uint32_t num_samples, uint32_t num_reps) {
   printf("[VECDOT f32 2] - number of samples = %u, number of iterations = %u\n",
          num_samples, num_reps);

-  const std::vector<float> a_re(num_samples);
-  const std::vector<float> a_im(num_samples);
-  const std::vector<float> b_re(num_samples);
-  const std::vector<float> b_im(num_samples);
-  float c_re;
-  float c_im;
+  const std::vector<float32_t> a_re(num_samples);
+  const std::vector<float32_t> a_im(num_samples);
+  const std::vector<float32_t> b_re(num_samples);
+  const std::vector<float32_t> b_im(num_samples);
+  float32_t c_re;
+  float32_t c_im;

   const auto *a_re_ptr = a_re.data();
   const auto *a_im_ptr = a_im.data();
diff --git a/bench/ElemWiseVectorMult/VecMul16/bench.py b/bench/BasicMathFun/VectorMult/VecMul16/bench.py
similarity index 82%
rename from bench/ElemWiseVectorMult/VecMul16/bench.py
rename to bench/BasicMathFun/VectorMult/VecMul16/bench.py
index c8d40c39d751ad97f7469ba8ff529620a43d426a..e6f953ef5759d03c4d99bfcd8a7f3bd1a1da227e 100755
--- a/bench/ElemWiseVectorMult/VecMul16/bench.py
+++ b/bench/BasicMathFun/VectorMult/VecMul16/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2020-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates

 import json
 from pathlib import Path
diff --git a/bench/ElemWiseVectorMult/VecMul16/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp
similarity index 91%
rename from bench/ElemWiseVectorMult/VecMul16/main.cpp
rename to bench/BasicMathFun/VectorMult/VecMul16/main.cpp
index 3c40c29ced7d3a3014b00dba30f3409e9e684e92..e6999be9c159e1770ae1eb53c72e280d13b96f11 100644
--- a/bench/ElemWiseVectorMult/VecMul16/main.cpp
+++ b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "armral.h"
diff --git a/bench/ElemWiseVectorMult/VecMul16_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py
similarity index 82%
rename from bench/ElemWiseVectorMult/VecMul16_2/bench.py
rename to bench/BasicMathFun/VectorMult/VecMul16_2/bench.py
index ee9f806cc2d637b54e93d7412008cf9be5201acf..99f3d83e006b56b23e922c6c02a0270f81a89d3c 100755 --- a/bench/ElemWiseVectorMult/VecMul16_2/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp similarity index 92% rename from bench/ElemWiseVectorMult/VecMul16_2/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp index 758eece8f5b8e640d541987b5bbab4b63d816090..876dd37ec5eb5ede8e5b02ef8f6d619e8220f103 100644 --- a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32/bench.py b/bench/BasicMathFun/VectorMult/VecMul32/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul32/bench.py rename to bench/BasicMathFun/VectorMult/VecMul32/bench.py index b8007ef22849129b2e6e6f88ab54f76c47a5cb33..ac6de3316a0a2ad89d0eb8322ae2ab5f8e302191 100755 --- a/bench/ElemWiseVectorMult/VecMul32/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp similarity index 91% rename from bench/ElemWiseVectorMult/VecMul32/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul32/main.cpp index 76c149aabb94b4246ada61dfa90230820fe3fdcb..07fec66c4555f0163e1f153bb3861dde595e0302 100644 --- a/bench/ElemWiseVectorMult/VecMul32/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul32_2/bench.py rename to bench/BasicMathFun/VectorMult/VecMul32_2/bench.py index ea08bf99ac1c039f367b62883ca72586ba212542..c7936ea9dd19b58ef331bb2dd9e5fba18c92f8a6 100755 --- a/bench/ElemWiseVectorMult/VecMul32_2/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp similarity index 74% rename from bench/ElemWiseVectorMult/VecMul32_2/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp index ec1a2102cc3f3984248b3b64243cd9b5a0a1c4ab..b42d6ad7f61d2f1c371b7594280a66aa33b428b7 100644 --- a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp +++ 
b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "armral.h"
@@ -14,12 +14,12 @@ void run_vec_mul_f32_2_perf(uint32_t num_samples, uint32_t num_reps) {
   printf("[VECMUL f32_2] - number of samples = %u, number of iterations = %u\n",
          num_samples, num_reps);

-  const std::vector<float> a_re(num_samples);
-  const std::vector<float> a_im(num_samples);
-  const std::vector<float> b_re(num_samples);
-  const std::vector<float> b_im(num_samples);
-  std::vector<float> c_re(num_samples);
-  std::vector<float> c_im(num_samples);
+  const std::vector<float32_t> a_re(num_samples);
+  const std::vector<float32_t> a_im(num_samples);
+  const std::vector<float32_t> b_re(num_samples);
+  const std::vector<float32_t> b_im(num_samples);
+  std::vector<float32_t> c_re(num_samples);
+  std::vector<float32_t> c_im(num_samples);

   const auto *a_re_ptr = a_re.data();
   const auto *a_im_ptr = a_im.data();
diff --git a/bench/MuLaw/Compression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py
similarity index 87%
rename from bench/MuLaw/Compression/14bit/bench.py
rename to bench/DuRuInterface/MuLaw/Compression/14bit/bench.py
index 2b9ab708016d93ef77192d914fdbfe195b5ed65a..95720ea6bd98ffdee2e111e982256ef5226b9642 100755
--- a/bench/MuLaw/Compression/14bit/bench.py
+++ b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2020-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates

 import json
 from pathlib import Path
diff --git a/bench/MuLaw/Compression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp
similarity index 93%
rename from bench/MuLaw/Compression/14bit/main.cpp
rename to bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp
index 73b7699aab0b13cd623ab5b9406cc1cf75cee0a8..0cd606cfbc7d21b8934717e365b69743de05fe87 100644
--- a/bench/MuLaw/Compression/14bit/main.cpp
+++ b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #include "armral.h"
diff --git a/bench/MuLaw/Compression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py
similarity index 87%
rename from bench/MuLaw/Compression/8bit/bench.py
rename to bench/DuRuInterface/MuLaw/Compression/8bit/bench.py
index 43cefd873b69ffa2750effaf40eb1cc3a92f9a88..f55e33ba31e2f59bba6efc00a3e8615db6c6f3e3 100755
--- a/bench/MuLaw/Compression/8bit/bench.py
+++ b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # Arm RAN Acceleration Library
-# Copyright 2020-2024 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates

 import json
 from pathlib import Path
diff --git a/bench/MuLaw/Compression/8bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp
similarity index 93%
rename from bench/MuLaw/Compression/8bit/main.cpp
rename to bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp
index 2faa911946b91e517a7e4d8c4bc8423d939f539b..8a489e1b5279b146d52427af796db1ad5d300ee2 100644
--- a/bench/MuLaw/Compression/8bit/main.cpp
+++ b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText:
Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py similarity index 87% rename from bench/MuLaw/Compression/9bit/bench.py rename to bench/DuRuInterface/MuLaw/Compression/9bit/bench.py index cc24e674f5583b2de0126ab3ef3e8722d640b175..82fc07f9fbe0735e610c72d563c1e3bcab1e80c7 100755 --- a/bench/MuLaw/Compression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/9bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp similarity index 93% rename from bench/MuLaw/Compression/9bit/main.cpp rename to bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp index a2c11187070c29a5f8584a3e773272ef9d46265c..f88240ea1c405fe3a89d2a7a9674c07c64761be1 100644 --- a/bench/MuLaw/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/14bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py index 8f6d2b1e18915ffc7e356ec1f7d7cb7e2c6162f2..48cb1fd53707853de750f205c76fdb7274bfdb6a 100755 --- a/bench/MuLaw/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/14bit/main.cpp rename to bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp index a24bf218678907c961d44cb399cfbb9305bcd3db..d0b3498e4f161cb4cfc7e3963a42747d6f3c93a2 100644 --- a/bench/MuLaw/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/8bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py index f70ecafd1da4a1d0e6dce7cf8b112c9add3814b5..0444d8a22f3def7e8e6de0fb8deebb4ceee757a1 100755 --- a/bench/MuLaw/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/8bit/main.cpp 
b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/8bit/main.cpp rename to bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp index c3a0f0a5a58d86e9c90c5cb8ffe166c31c0b4578..2a50c5d1cbf4f2b06788c42c12036cd6f918b732 100644 --- a/bench/MuLaw/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/9bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py index 67512df561b404e5abdd731e5775ca3cc28f6afa..2bfe560ad7eb47cfb80c8dec8610cff17285df79 100755 --- a/bench/MuLaw/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/9bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/9bit/main.cpp rename to bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp index 2bcde05b4f61d09d7ecc16d4e6b7471e70ef1fc9..c3b1b853062c38cd53322fd44bbe1d4c80799669 100644 --- a/bench/MuLaw/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/12bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py index 744bd01eeceae7921b394a09922c7206e23e9589..54bce883feb879c24870910eb14e17c5a8b44fe0 100755 --- a/bench/XRanBlockFloat/Compression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/12bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp index ec36a751d3566f336b1062c34e83ccbb30891dc0..b4c34d7f0f094cb80d3d40d96089049fe0af5a0e 100644 --- a/bench/XRanBlockFloat/Compression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/14bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py similarity index 87% rename from 
bench/XRanBlockFloat/Compression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py index 10f2e16f31bfcb14c8407ec24f72b070f03eea8f..3b30a95820d97907c93ce72492a251a2ddcda38f 100755 --- a/bench/XRanBlockFloat/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp index eff869855731e9db8b862dd4fef6e408c8738a62..eb1cc6a235203212367f1980df6cd1b04b98d67f 100644 --- a/bench/XRanBlockFloat/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py index 3e5f2f324f2482ab5700b3cbad461ab6f81cc348..baac1526473ae86fe3e1406528b67c669ef9e51e 100755 --- a/bench/XRanBlockFloat/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp index 1aa7d2c5fd33e0d05fb7e034edfde57096d26774..be88dc196a1a34ceb992a50df5b141566d4c9168 100644 --- a/bench/XRanBlockFloat/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py index 73391e2e5e9b12eca53c62c1db9ab071c12246e7..2dfa15dfdde518c88c41eabed217fd14f39dfc0f 100755 --- a/bench/XRanBlockFloat/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/9bit/main.cpp 
b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp index 6a96d355bde791d29ec1c6d06a34af2127b4e0fd..a25364282f7f071da17a44abe61f73859120254c 100644 --- a/bench/XRanBlockFloat/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/12bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py index f9ec6f833a8f9a602668aa509259614332a71af8..1a0883aee5c4e0bfd2547c02b6f5130c2d988f24 100755 --- a/bench/XRanBlockFloat/Decompression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/12bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp index 9816ac1759f9d8acfe55b9fa84cd70622e4ef7c2..fc9b8a826ff354bffefa96766f51e3b6b7f830ee 100644 --- a/bench/XRanBlockFloat/Decompression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py index 1f08f502878812af51b09776f185b3bb8cbfdc2b..6b38d699bc883d7a80c171d46e344c1cfa8250d3 100755 --- a/bench/XRanBlockFloat/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp index 52226a5c1a564c53c782e90340d3b45725ab2a9e..285eaa65053344e591034a4c93f4ade0d7c44c55 100644 --- a/bench/XRanBlockFloat/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ 
#include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py index f20eb2b1eaa0886917e3aca46e5e2f71c813fdea..8f6f7e8e693238833306a79fd63312fe35516e3b 100755 --- a/bench/XRanBlockFloat/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp index 7734d1207c3a971d67d3dcfbbad810910924c6e5..8ba5be502407975c9019406958acc7d9723e88fe 100644 --- a/bench/XRanBlockFloat/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py index 5cf57a71a0322043fe233d4eac530e90e3ac6926..c19dff8f0da95652e79dca29920f34cd3bffda7e 100755 --- a/bench/XRanBlockFloat/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp index 1e868ffca297b32471ad9e9226da586056c88feb..807199556336691101aa5d7947d46678ca40dd1a 100644 --- a/bench/XRanBlockFloat/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py index e2b2f15b79c723eaea9174b1be4e48bde3dd3409..efc7012590cce747d2ede6e7a8170c82b179eac2 100755 --- a/bench/ORanBlockScaling/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or 
its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp index 37f8da1247d9f94b972c6524465921f47058c1f8..754710c0c070d3084547b74d6ed0fe3c88a457d8 100644 --- a/bench/ORanBlockScaling/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py index 65d55372bdc61a47ed7854067eadbdbc3c29e4b5..78bc30807ec658af3053a25f27809c56df671faf 100755 --- a/bench/ORanBlockScaling/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp index 43286ca2accaf71cbd6b788997ee7ec1f2a78be7..73958aee93d16281055171fc291baf288dac4153 100644 --- a/bench/ORanBlockScaling/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py index 54f99318b9b3804b5443a693c4b42bce567b947e..bfcc06884c59d9a4a7632654daabef28323ce863 100755 --- a/bench/ORanBlockScaling/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp index 7d66f4213df7a39a57c416fa2b01c532ceaec326..b3436d7ea3a33529dced72fe0ebb89c42a0f61ff 100644 --- a/bench/ORanBlockScaling/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp 
@@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Decompression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py index cbb57d29cba049ff58e2b312dc634157ebd34a26..ac7429b7ec8c735eaea5c3e7bf72fd124ec8bdeb 100755 --- a/bench/ORanBlockScaling/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp index a9448f9012e0244fcbe186eb07b4260a1bdec92d..6e0782198c528f7dc73195c4648b93099cabd107 100644 --- a/bench/ORanBlockScaling/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Decompression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py index 2807325192f8838579b248d47e74135a040b8ae8..7d12222fa227506f03e35ac45759ddf2d9e0ba83 100755 --- a/bench/ORanBlockScaling/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp index 60ffeec519bf26ae99e03dc6abb375e43ff157fc..f5bedca42aa21222b0ea04e29d24857e0d1b41ac 100644 --- a/bench/ORanBlockScaling/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Decompression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py index 
f16d82dca3a9266e2e7788a5984ef250a63ccc16..4a2ab15f4e4631c73c4f59dee4fd92efee8f71d1 100755 --- a/bench/ORanBlockScaling/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp index 8cbab20543795c711dc8483bea51d24640f73d7e..2c6aa9e2803a1aed107a8963ec1432b60759e2f1 100644 --- a/bench/ORanBlockScaling/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Correlation/bench.py b/bench/LowerPHY/Correlation/bench.py similarity index 82% rename from bench/Correlation/bench.py rename to bench/LowerPHY/Correlation/bench.py index 9a36a392b8a09a019dc41643f90c4c8dfef277d1..e2a9be154217c1ac2a99ab915e295567b6b36a54 100755 --- a/bench/Correlation/bench.py +++ b/bench/LowerPHY/Correlation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Correlation/main.cpp b/bench/LowerPHY/Correlation/main.cpp similarity index 89% rename from bench/Correlation/main.cpp rename to bench/LowerPHY/Correlation/main.cpp index 068172fcf58334024c260d066c9b6a3d76db64e1..7315a83c3d2c17ea4804edc08a1287193f2f87c7 100644 --- a/bench/Correlation/main.cpp +++ b/bench/LowerPHY/Correlation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FFT/FFT16/bench.py b/bench/LowerPHY/FFT/FFT16/bench.py similarity index 87% rename from bench/FFT/FFT16/bench.py rename to bench/LowerPHY/FFT/FFT16/bench.py index f560e04ba74a151db2542990f9c41f7a84c5374f..6e2190016f972abfc51456c58b0a2498a47f95e4 100755 --- a/bench/FFT/FFT16/bench.py +++ b/bench/LowerPHY/FFT/FFT16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT16/main.cpp b/bench/LowerPHY/FFT/FFT16/main.cpp similarity index 92% rename from bench/FFT/FFT16/main.cpp rename to bench/LowerPHY/FFT/FFT16/main.cpp index 2ce24b081f695a5f995bb048d08849b712f16909..1bf340ee072c27152aab99b61707a9fab320f48b 100644 --- a/bench/FFT/FFT16/main.cpp +++ b/bench/LowerPHY/FFT/FFT16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FFT/FFT32/bench.py 
b/bench/LowerPHY/FFT/FFT32/bench.py similarity index 87% rename from bench/FFT/FFT32/bench.py rename to bench/LowerPHY/FFT/FFT32/bench.py index 83a3e73190defb9bd331d2d313dcc99cca61117b..e84f4fc8647f19b2c6543e15d5da2dbc7f21bb0b 100755 --- a/bench/FFT/FFT32/bench.py +++ b/bench/LowerPHY/FFT/FFT32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT32/main.cpp b/bench/LowerPHY/FFT/FFT32/main.cpp similarity index 92% rename from bench/FFT/FFT32/main.cpp rename to bench/LowerPHY/FFT/FFT32/main.cpp index d20456b4fd43594fc02fc3c917e8ceb68eca1a6a..1d469fd6e4176ac0213e836d806f8bfbe894d9de 100644 --- a/bench/FFT/FFT32/main.cpp +++ b/bench/LowerPHY/FFT/FFT32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16/bench.py b/bench/LowerPHY/FIR/FIR16/bench.py similarity index 83% rename from bench/FIR/FIR16/bench.py rename to bench/LowerPHY/FIR/FIR16/bench.py index f0b19e8c5665be9b5749efb1dedf097a6fba2e78..23a3626af1df40ef3148faffd95ed51d5022b771 100755 --- a/bench/FIR/FIR16/bench.py +++ b/bench/LowerPHY/FIR/FIR16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16/main.cpp b/bench/LowerPHY/FIR/FIR16/main.cpp similarity index 93% rename from bench/FIR/FIR16/main.cpp rename to bench/LowerPHY/FIR/FIR16/main.cpp index 58ee2c8b005047f87a8ea5826cd42ef3d97e7f6a..aae2b728c32a7ef2875dbfb459a23f1217e39f24 100644 --- a/bench/FIR/FIR16/main.cpp +++ b/bench/LowerPHY/FIR/FIR16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16Decimate2/bench.py b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py similarity index 84% rename from bench/FIR/FIR16Decimate2/bench.py rename to bench/LowerPHY/FIR/FIR16Decimate2/bench.py index 956ca7cd2f86e34f74c89716ee5fbc4be50dcacf..bd47c5dfef9cc4082a9d1c376655ab9ce8fe4dc4 100755 --- a/bench/FIR/FIR16Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp similarity index 93% rename from bench/FIR/FIR16Decimate2/main.cpp rename to bench/LowerPHY/FIR/FIR16Decimate2/main.cpp index 8b8265ae0349a0dc4fe18dda548ac4a14ad8a9ac..f11ee5f142152e21ad15dd1abf52089912e602fd 100644 --- a/bench/FIR/FIR16Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32/bench.py 
b/bench/LowerPHY/FIR/FIR32/bench.py similarity index 83% rename from bench/FIR/FIR32/bench.py rename to bench/LowerPHY/FIR/FIR32/bench.py index 86757b6adb4187f94767ac44ce982520d5b34dae..bb24247ddee620d92dabae51d2703b5a91ccc7dd 100755 --- a/bench/FIR/FIR32/bench.py +++ b/bench/LowerPHY/FIR/FIR32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32/main.cpp b/bench/LowerPHY/FIR/FIR32/main.cpp similarity index 93% rename from bench/FIR/FIR32/main.cpp rename to bench/LowerPHY/FIR/FIR32/main.cpp index 02e3b087c3a3d635082259f622316003747ad9ca..b376ccda23322072d77c69dcd71b0e4cd0b06aa8 100644 --- a/bench/FIR/FIR32/main.cpp +++ b/bench/LowerPHY/FIR/FIR32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32Decimate2/bench.py b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py similarity index 84% rename from bench/FIR/FIR32Decimate2/bench.py rename to bench/LowerPHY/FIR/FIR32Decimate2/bench.py index 41fc6c18739e031b4e7c4b0e57132e1bb5d78e6e..f70853ae2bc88c8dbda8103eabd0f642c7be8936 100755 --- a/bench/FIR/FIR32Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp similarity index 93% rename from bench/FIR/FIR32Decimate2/main.cpp rename to bench/LowerPHY/FIR/FIR32Decimate2/main.cpp index b663f1906e5021b2e82f0f34d220c00eede9e8d0..d8ac010935d48da109128f81034f13c852429308 100644 --- a/bench/FIR/FIR32Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Scrambling/bench.py b/bench/LowerPHY/Scrambling/bench.py similarity index 84% rename from bench/Scrambling/bench.py rename to bench/LowerPHY/Scrambling/bench.py index ae4e285acec471e2f7c10eec60ede7e7adb64178..ad7b7b27cf931a6ec3394b8963a783afb34428a4 100755 --- a/bench/Scrambling/bench.py +++ b/bench/LowerPHY/Scrambling/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Scrambling/main.cpp b/bench/LowerPHY/Scrambling/main.cpp similarity index 89% rename from bench/Scrambling/main.cpp rename to bench/LowerPHY/Scrambling/main.cpp index 6d85a8f0ff266773297d47dfd533180eee219f59..5e1985eb2318c79b6459e681f5c541dc3f66db85 100644 --- a/bench/Scrambling/main.cpp +++ b/bench/LowerPHY/Scrambling/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/SeqGenerator/bench.py 
b/bench/LowerPHY/SeqGenerator/bench.py similarity index 84% rename from bench/SeqGenerator/bench.py rename to bench/LowerPHY/SeqGenerator/bench.py index 7d8ae2756d5908d00d8bc712ce6eeb818af7cae5..64db32d9bc4695ba571dbfced1bdc98b727d5e10 100755 --- a/bench/SeqGenerator/bench.py +++ b/bench/LowerPHY/SeqGenerator/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/SeqGenerator/main.cpp b/bench/LowerPHY/SeqGenerator/main.cpp similarity index 86% rename from bench/SeqGenerator/main.cpp rename to bench/LowerPHY/SeqGenerator/main.cpp index 49baa2aa1393737b15e9c5c171333ca54c018a24..259e102b9fd65c60f8cfb6edb95b809f10bbde92 100644 --- a/bench/SeqGenerator/main.cpp +++ b/bench/LowerPHY/SeqGenerator/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/bench/SVD/bench.py b/bench/MatrixFactorizations/SVD/bench.py similarity index 86% rename from bench/SVD/bench.py rename to bench/MatrixFactorizations/SVD/bench.py index 22a8591ac0276283eaf1d15c1033fdb34e35d0d5..4cb05bd9cb80d556cd3cb5cc3b1f6fdb95a9ce4c 100755 --- a/bench/SVD/bench.py +++ b/bench/MatrixFactorizations/SVD/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/SVD/main.cpp b/bench/MatrixFactorizations/SVD/main.cpp similarity index 89% rename from bench/SVD/main.cpp rename to bench/MatrixFactorizations/SVD/main.cpp index 86cba9298b322b1fe844412743d49a98dd0cd29f..61e54443e8d5c18b48f5e8a0911b80e08a72961c 100644 --- a/bench/SVD/main.cpp +++ b/bench/MatrixFactorizations/SVD/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h"
@@ -19,9 +19,9 @@ void bench_svd(bool gen_singular_vectors, int m, int n, int nreps) {
   std::vector<armral_cmplx_f32_t> a(size, {1.0F, 1.0F});
   const int lda = n;
   for (int i = 0; i < n; ++i) {
-    a[i + lda * i] = armral_cmplx_f32_t{static_cast<float>(i + 2), 0};
+    a[i + lda * i] = armral_cmplx_f32_t{static_cast<float>(i + 2), 0};
   }
-  std::vector<float> s(n);
+  std::vector<float> s(n);
   // Left and right singular vectors.
   std::vector<armral_cmplx_f32_t> u;
diff --git a/bench/CRC/11/BigEndian/bench.py b/bench/UpperPHY/CRC/11/BigEndian/bench.py similarity index 85% rename from bench/CRC/11/BigEndian/bench.py rename to bench/UpperPHY/CRC/11/BigEndian/bench.py index 6c6f668545b78a7d31667646c7339348808e5534..b2c277793943792b47fe2c936705827ea82e8216 100755 --- a/bench/CRC/11/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/11/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/BigEndian/main.cpp b/bench/UpperPHY/CRC/11/BigEndian/main.cpp similarity index 91% rename from bench/CRC/11/BigEndian/main.cpp rename to bench/UpperPHY/CRC/11/BigEndian/main.cpp index d82dbd425e411243d7baf3edf41b9e442d5485b3..a75f3e83ebeaaad92723b7474a571e9b89694244 100644 --- a/bench/CRC/11/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/11/LittleEndian/bench.py b/bench/UpperPHY/CRC/11/LittleEndian/bench.py similarity index 85% rename from bench/CRC/11/LittleEndian/bench.py rename to bench/UpperPHY/CRC/11/LittleEndian/bench.py index 350c7eae6b0e4e77f07102557aa48c13e409eaf9..bca79a9245bd513fd9db2057f145ae2af13693af 100755 --- a/bench/CRC/11/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/11/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/LittleEndian/main.cpp b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/11/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/11/LittleEndian/main.cpp index 533b507fc32723f253b2674ffcf5c5081e3bd4c7..0e82518690c4fde32ad43e5ffd8244e41c89c3a4 100644 --- a/bench/CRC/11/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/BigEndian/bench.py b/bench/UpperPHY/CRC/16/BigEndian/bench.py similarity index 85% rename from bench/CRC/16/BigEndian/bench.py rename to bench/UpperPHY/CRC/16/BigEndian/bench.py index 8bf0fc0da405e78369bdac10fd01120c0ee0d810..738b08acc62f4a8f71eea5d0717daddb135e6f62 100755 --- a/bench/CRC/16/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/16/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/BigEndian/main.cpp b/bench/UpperPHY/CRC/16/BigEndian/main.cpp similarity index 91% rename from bench/CRC/16/BigEndian/main.cpp rename to bench/UpperPHY/CRC/16/BigEndian/main.cpp index a81ccf53f706a73df49fc13475c3f3ced1cbc01f..9265e41f29f6cb5fa133f66579dd8a7c6c083a12 100644 --- a/bench/CRC/16/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration
Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/LittleEndian/bench.py b/bench/UpperPHY/CRC/16/LittleEndian/bench.py similarity index 85% rename from bench/CRC/16/LittleEndian/bench.py rename to bench/UpperPHY/CRC/16/LittleEndian/bench.py index 4c8ce839c7e42ea3061b4614a098930fb4ff0482..5c6cc1ffe8236286d053e19afa18f4cfd949049b 100755 --- a/bench/CRC/16/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/16/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/LittleEndian/main.cpp b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/16/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/16/LittleEndian/main.cpp index ded10e8a30cf2479dc13a72f8f43cbe958f4cbed..d1cd3439b59cd045194249c83138eeaaaa495eb0 100644 --- a/bench/CRC/16/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/BigEndian/bench.py b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/A/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/A/BigEndian/bench.py index a69cb7e0c4e2d3b85100a2c440716d9e8d0b99ad..8052caed10115cc3b7cd6392ede854218a59d41b 100755 --- a/bench/CRC/24/A/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/A/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/A/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/A/BigEndian/main.cpp index ee1e1c766792a8d56483b17a73ced19e5d9d9989..33313dd65771e673a56571695183b8a4070d01cf 100644 --- a/bench/CRC/24/A/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/A/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/A/LittleEndian/bench.py index 576bafc2996b9d64c9b04b544b8a5d523a9025d7..64c1ccc59df33aec48678eb89c92442e8c595755 100755 --- a/bench/CRC/24/A/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/A/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/A/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp 
index 17325f8a50a9d8db86917b08074c15c3f1cc5265..7c0e405ae71a56350c65dc072dcf99bedbcdd6ca 100644 --- a/bench/CRC/24/A/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/BigEndian/bench.py b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/B/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/B/BigEndian/bench.py index aa318554e83202bf5980d86a9b7300a812d4f6c9..739668557c2ab9c7968f33730cebcbae5335a48e 100755 --- a/bench/CRC/24/B/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/B/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/B/BigEndian/main.cpp index 876deaf02e27ed6accee0c3d91096e76ce323b41..c557b4707075bccc7621f916791ca44c746b8b6b 100644 --- a/bench/CRC/24/B/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/B/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/B/LittleEndian/bench.py index cbd7e953a87b4e8d5c4af1342a2bbc760ece8ba7..06bfea6fe711ea1ba2c196daf2c72da2c4ae63e8 100755 --- a/bench/CRC/24/B/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/B/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp index b19eb358d9e7c5233646b1870f3c9ed38b4e2e9e..b332e1b6d050fdf2ce26cb619346327393c26dd0 100644 --- a/bench/CRC/24/B/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/BigEndian/bench.py b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/C/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/C/BigEndian/bench.py index 42303ee6b27b11e0cbe44160eaf2563140a8d2dd..1df67fdc8513edb4da41c85e814b9b21fbea957f 100755 --- a/bench/CRC/24/C/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from 
pathlib import Path diff --git a/bench/CRC/24/C/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/C/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/C/BigEndian/main.cpp index e1a18f2fa8cc0739a54e5cacd843fd50c7b4415b..f4d8553de7ef5703ad67b680b7a3a25227fbfc20 100644 --- a/bench/CRC/24/C/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/C/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/C/LittleEndian/bench.py index 331bb2609f9b4fdad5f02d0826f1ee10682611ff..70471b5c177def2b78a935576bba7c4be486fa43 100755 --- a/bench/CRC/24/C/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/C/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/C/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp index d9c0a813ce2e6b934aaf77b6c774cb1e759f0475..f3cfbc59c19d242b41d76a4016de8794eae3c678 100644 --- a/bench/CRC/24/C/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/BigEndian/bench.py b/bench/UpperPHY/CRC/6/BigEndian/bench.py similarity index 85% rename from bench/CRC/6/BigEndian/bench.py rename to bench/UpperPHY/CRC/6/BigEndian/bench.py index bb642253c2ec69c39e26ee273f653698bddf0e37..1bc3711217effee82161458c2178e6a28f9c98df 100755 --- a/bench/CRC/6/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/6/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/BigEndian/main.cpp b/bench/UpperPHY/CRC/6/BigEndian/main.cpp similarity index 91% rename from bench/CRC/6/BigEndian/main.cpp rename to bench/UpperPHY/CRC/6/BigEndian/main.cpp index b74b808383259a730d5ee47bdc8a93bb9abef47a..3ed97a408fd00d9481746f4d8150a15273785e1d 100644 --- a/bench/CRC/6/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/6/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/LittleEndian/bench.py b/bench/UpperPHY/CRC/6/LittleEndian/bench.py similarity index 85% rename from bench/CRC/6/LittleEndian/bench.py rename to bench/UpperPHY/CRC/6/LittleEndian/bench.py index 7878f82c8dccd3d3c2d1ece0bdb252c1cf778265..7cb63784161c491962bc68ffb2b4f4c5987a1269 100755 --- a/bench/CRC/6/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/6/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # 
Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/LittleEndian/main.cpp b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/6/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/6/LittleEndian/main.cpp index 8363eae8ab05689f78965ee47ea307d05a3f52be..ab6958ec35eecc5905b2ef66490854072907d80e 100644 --- a/bench/CRC/6/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Decoding/bench.py b/bench/UpperPHY/ConvolutionalDecoder/bench.py similarity index 85% rename from bench/ConvCoding/Decoding/bench.py rename to bench/UpperPHY/ConvolutionalDecoder/bench.py index 16ebdb05111357834cc49305b3ccc4b4eff68ada..f9c42b3ce3f5472964f02c661e584c8ad0338b4c 100755 --- a/bench/ConvCoding/Decoding/bench.py +++ b/bench/UpperPHY/ConvolutionalDecoder/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Decoding/main.cpp b/bench/UpperPHY/ConvolutionalDecoder/main.cpp similarity index 94% rename from bench/ConvCoding/Decoding/main.cpp rename to bench/UpperPHY/ConvolutionalDecoder/main.cpp index 8bc34dda6ee2c143295332c49399c9bfb0c5d2d5..fbcfd53dc05e42c5428906bca3edf3fc9280d0ac 100644 --- a/bench/ConvCoding/Decoding/main.cpp +++ b/bench/UpperPHY/ConvolutionalDecoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Encoding/bench.py b/bench/UpperPHY/ConvolutionalEncoder/bench.py similarity index 85% rename from bench/ConvCoding/Encoding/bench.py rename to bench/UpperPHY/ConvolutionalEncoder/bench.py index fca556c2f1b950f510686d716f2bd518453b4c6a..7dc34b60cac304cf2c88c5e2b28bdfcc6ab6f80c 100755 --- a/bench/ConvCoding/Encoding/bench.py +++ b/bench/UpperPHY/ConvolutionalEncoder/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Encoding/main.cpp b/bench/UpperPHY/ConvolutionalEncoder/main.cpp similarity index 92% rename from bench/ConvCoding/Encoding/main.cpp rename to bench/UpperPHY/ConvolutionalEncoder/main.cpp index 8221726656c6bb5cf4238cbafb0437e2c953f891..65b9941ac2e9eb406fc1a08052fba494fce5b1c6 100644 --- a/bench/ConvCoding/Encoding/main.cpp +++ b/bench/UpperPHY/ConvolutionalEncoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Demodulation/bench.py b/bench/UpperPHY/Demodulation/bench.py similarity index 85% rename from bench/Demodulation/bench.py rename to bench/UpperPHY/Demodulation/bench.py index 
051554d0168def88ca7b850107bc6745c12450d8..1a099a6c075709fa3e46c44adcdcbe5681cb5d2a 100755 --- a/bench/Demodulation/bench.py +++ b/bench/UpperPHY/Demodulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Demodulation/main.cpp b/bench/UpperPHY/Demodulation/main.cpp similarity index 95% rename from bench/Demodulation/main.cpp rename to bench/UpperPHY/Demodulation/main.cpp index d6b9c926484636062cbb04a0914de11dc2e59049..e4e06fd417ee06323a1f6124d7703d1d41ec4c3c 100644 --- a/bench/Demodulation/main.cpp +++ b/bench/UpperPHY/Demodulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/LDPC/Decoding/bench.py b/bench/UpperPHY/LDPC/Decoding/bench.py similarity index 92% rename from bench/LDPC/Decoding/bench.py rename to bench/UpperPHY/LDPC/Decoding/bench.py index 620ad1238b367ba0775a7a38cf0843622e06834b..0476cc7df0f5c2b1796f224a72db3db218ca2c76 100755 --- a/bench/LDPC/Decoding/bench.py +++ b/bench/UpperPHY/LDPC/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/LDPC/Decoding/main.cpp b/bench/UpperPHY/LDPC/Decoding/main.cpp similarity index 95% rename from bench/LDPC/Decoding/main.cpp rename to bench/UpperPHY/LDPC/Decoding/main.cpp index 9d26974555fd994bdd9df11b40add1cc63d63885..85acce39cb6dd3303529576a6259439d79b4137c 100755 --- a/bench/LDPC/Decoding/main.cpp +++ b/bench/UpperPHY/LDPC/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/LDPC/Encoding/bench.py b/bench/UpperPHY/LDPC/Encoding/bench.py similarity index 90% rename from bench/LDPC/Encoding/bench.py rename to bench/UpperPHY/LDPC/Encoding/bench.py index dd8f9d406d101c3b239df605e4e07e09668d54cd..3a8e7fb8724d0ec55e58aeeaec73f611928767e4 100755 --- a/bench/LDPC/Encoding/bench.py +++ b/bench/UpperPHY/LDPC/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/LDPC/Encoding/main.cpp b/bench/UpperPHY/LDPC/Encoding/main.cpp similarity index 95% rename from bench/LDPC/Encoding/main.cpp rename to bench/UpperPHY/LDPC/Encoding/main.cpp index d7c075a26b5ce3b2fee0699da35ddb31ab3713dd..cbc6cbd65643619cc7ac1d172c08285b3eb52257 100644 --- a/bench/LDPC/Encoding/main.cpp +++ b/bench/UpperPHY/LDPC/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/bench/LDPC/RateMatching/bench.py b/bench/UpperPHY/LDPC/RateMatching/bench.py similarity index 90% rename from bench/LDPC/RateMatching/bench.py rename to 
bench/UpperPHY/LDPC/RateMatching/bench.py index 5d752eca3e5aaa5797e6590031c139d591d13226..cc49114dc0af421d1268e828c1a7a67bb774196c 100755 --- a/bench/LDPC/RateMatching/bench.py +++ b/bench/UpperPHY/LDPC/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/LDPC/RateMatching/main.cpp b/bench/UpperPHY/LDPC/RateMatching/main.cpp similarity index 96% rename from bench/LDPC/RateMatching/main.cpp rename to bench/UpperPHY/LDPC/RateMatching/main.cpp index d99459abbabb743253e6330c12ff5931ecba43f6..5aa17c545152f66138f193a6ded0050ec0d498d5 100644 --- a/bench/LDPC/RateMatching/main.cpp +++ b/bench/UpperPHY/LDPC/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/LDPC/RateRecovery/bench.py b/bench/UpperPHY/LDPC/RateRecovery/bench.py similarity index 90% rename from bench/LDPC/RateRecovery/bench.py rename to bench/UpperPHY/LDPC/RateRecovery/bench.py index 02463ca68f0ca3a326a39df41ef032c1a0cdb459..8c0004963550d79a64be7ca95dbf0a68033570bd 100755 --- a/bench/LDPC/RateRecovery/bench.py +++ b/bench/UpperPHY/LDPC/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/LDPC/RateRecovery/main.cpp b/bench/UpperPHY/LDPC/RateRecovery/main.cpp similarity index 95% rename from bench/LDPC/RateRecovery/main.cpp rename to bench/UpperPHY/LDPC/RateRecovery/main.cpp index af9e0566e8eec31512552a8306435a66318e79cf..469a4bdb73f602c446d54274260df5e4da69bb8b 100644 --- a/bench/LDPC/RateRecovery/main.cpp +++ b/bench/UpperPHY/LDPC/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/Modulation/bench.py b/bench/UpperPHY/Modulation/bench.py similarity index 86% rename from bench/Modulation/bench.py rename to bench/UpperPHY/Modulation/bench.py index e6dcff60cbc8b5b21d0b59f7a7220c2ebf1ea01e..9933b7b626a84c552845ff4c809453e88f17bfb1 100755 --- a/bench/Modulation/bench.py +++ b/bench/UpperPHY/Modulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Modulation/main.cpp b/bench/UpperPHY/Modulation/main.cpp similarity index 95% rename from bench/Modulation/main.cpp rename to bench/UpperPHY/Modulation/main.cpp index 5e0f7ba9848dba69a5121fc020b67f6c425621e4..bb777f019efac247d48efa9ac25e3f0f0d5b6491 100644 --- a/bench/Modulation/main.cpp +++ b/bench/UpperPHY/Modulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Decoding/bench.py 
b/bench/UpperPHY/Polar/Decoding/bench.py similarity index 88% rename from bench/Polar/Decoding/bench.py rename to bench/UpperPHY/Polar/Decoding/bench.py index b9b3ad64cf9c913596c879f2781232af032e1543..5cddc12e2f4eeca980d717bf1e94b729535c9966 100755 --- a/bench/Polar/Decoding/bench.py +++ b/bench/UpperPHY/Polar/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Decoding/main.cpp b/bench/UpperPHY/Polar/Decoding/main.cpp similarity index 89% rename from bench/Polar/Decoding/main.cpp rename to bench/UpperPHY/Polar/Decoding/main.cpp index 6da1928fc9c0ba8d85dbd0cd78f8982a79bba9ae..31e89fb160a8564e09b5388fa164e356fef74354 100644 --- a/bench/Polar/Decoding/main.cpp +++ b/bench/UpperPHY/Polar/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "rng.hpp"
@@ -30,8 +30,8 @@ void run_polar_decoding_perf(uint32_t n, uint32_t k, uint32_t l,
   // microarchitectural branch prediction are too optimistic and give an
   // unrealistically fast result. We use a linear congruential generator to
   // avoid calling rand() or C++ random number generators, which are both slow.
-  linear_congruential_generator lcg;
-  auto state = random_state::from_seeds({42});
+  armral::utils::linear_congruential_generator lcg;
+  auto state = armral::utils::random_state::from_seeds({42});
   for (uint32_t i = 0; i < n; ++i) {
     ((uint8_t *)a.data())[i] = lcg.one(&state);
   }
diff --git a/bench/Polar/Encoding/bench.py b/bench/UpperPHY/Polar/Encoding/bench.py similarity index 84% rename from bench/Polar/Encoding/bench.py rename to bench/UpperPHY/Polar/Encoding/bench.py index d01db8b628d638642ae360e595e8308ba6109bcc..d05b5db297c57e7c2cf0f4154f54502b549170b2 100755 --- a/bench/Polar/Encoding/bench.py +++ b/bench/UpperPHY/Polar/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Encoding/main.cpp b/bench/UpperPHY/Polar/Encoding/main.cpp similarity index 89% rename from bench/Polar/Encoding/main.cpp rename to bench/UpperPHY/Polar/Encoding/main.cpp index 86bd403b4ea42700bbb0346ba2dcf3722a298ba7..a1cab8be0603d011f1a58d97fa77c4080665da00 100644 --- a/bench/Polar/Encoding/main.cpp +++ b/bench/UpperPHY/Polar/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Frozen/bench.py b/bench/UpperPHY/Polar/Frozen/bench.py similarity index 93% rename from bench/Polar/Frozen/bench.py rename to bench/UpperPHY/Polar/Frozen/bench.py index c25d3c8d616184443521632692cba30a9558cf46..50648a21f9586993e8715e5c40acc6f5edf95f7b 100755 --- a/bench/Polar/Frozen/bench.py +++ b/bench/UpperPHY/Polar/Frozen/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib
import Path diff --git a/bench/Polar/Frozen/main.cpp b/bench/UpperPHY/Polar/Frozen/main.cpp similarity index 93% rename from bench/Polar/Frozen/main.cpp rename to bench/UpperPHY/Polar/Frozen/main.cpp index 8db346aafbe8462e2d3467403310f0a6cab052fd..5ba5e3553f81617b365db33b4d5602bc751dcc42 100644 --- a/bench/Polar/Frozen/main.cpp +++ b/bench/UpperPHY/Polar/Frozen/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateMatching/bench.py b/bench/UpperPHY/Polar/RateMatching/bench.py similarity index 91% rename from bench/Polar/RateMatching/bench.py rename to bench/UpperPHY/Polar/RateMatching/bench.py index fa5715ff2b32375cce9057c0b732856e65b757e4..92c0535df5c85eb43493b011a09fe62d2f92992d 100755 --- a/bench/Polar/RateMatching/bench.py +++ b/bench/UpperPHY/Polar/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateMatching/main.cpp b/bench/UpperPHY/Polar/RateMatching/main.cpp similarity index 94% rename from bench/Polar/RateMatching/main.cpp rename to bench/UpperPHY/Polar/RateMatching/main.cpp index af6a831645b67c36eebcf9211094d5ade41f804c..a5bf08ab34dfd21520363b2c69b7da6f9f6166d9 100644 --- a/bench/Polar/RateMatching/main.cpp +++ b/bench/UpperPHY/Polar/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateRecovery/bench.py b/bench/UpperPHY/Polar/RateRecovery/bench.py similarity index 91% rename from bench/Polar/RateRecovery/bench.py rename to bench/UpperPHY/Polar/RateRecovery/bench.py index 4687b6dfeb447b44dc007e0d6bcc3a8ff7fff0b2..a2a2c3f8db5583d830b29a3f5e3eee88e4050f04 100755 --- a/bench/Polar/RateRecovery/bench.py +++ b/bench/UpperPHY/Polar/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateRecovery/main.cpp b/bench/UpperPHY/Polar/RateRecovery/main.cpp similarity index 94% rename from bench/Polar/RateRecovery/main.cpp rename to bench/UpperPHY/Polar/RateRecovery/main.cpp index b687110b0a99d159e66f7e7296b6b1f15a3436d2..019b4a58508540ae45a2100076d176005dc57808 100644 --- a/bench/Polar/RateRecovery/main.cpp +++ b/bench/UpperPHY/Polar/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelDeinterleave/bench.py b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py similarity index 88% rename from bench/Polar/SubchannelDeinterleave/bench.py rename to bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py index d804d3bb03d333815421e994a9383e65b71db150..29fd5bc331ac674c0944adc34987c006a0872aa1 100755 --- a/bench/Polar/SubchannelDeinterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py @@ -1,6 +1,6 @@ 
#!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelDeinterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp similarity index 93% rename from bench/Polar/SubchannelDeinterleave/main.cpp rename to bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index 54e910869df460f1e41978ab7cb1fa27c439b18b..e5bb27dc699bf7522984af99fb0d4e3e7fb1f0fb 100644 --- a/bench/Polar/SubchannelDeinterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelInterleave/bench.py b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py similarity index 88% rename from bench/Polar/SubchannelInterleave/bench.py rename to bench/UpperPHY/Polar/SubchannelInterleave/bench.py index 8620391efd785cd4f07f90587da1af740bf7e3d8..de89975decfd9391e7a0cb2e499f508d097f863a 100755 --- a/bench/Polar/SubchannelInterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelInterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp similarity index 93% rename from bench/Polar/SubchannelInterleave/main.cpp rename to bench/UpperPHY/Polar/SubchannelInterleave/main.cpp index c2623be63b33cb6912e371255ee4542e9dff1e49..01d3db481fec644568e822a0401537ac6f57eac5 100644 --- a/bench/Polar/SubchannelInterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/Decoding/bench.py b/bench/UpperPHY/Turbo/Decoding/bench.py similarity index 84% rename from bench/Turbo/Decoding/bench.py rename to bench/UpperPHY/Turbo/Decoding/bench.py index ebd3e3868c53c85f80cc942725dd0a1c09d9cc89..11c546a87d579866cd598912bd94a2471a80eb2e 100755 --- a/bench/Turbo/Decoding/bench.py +++ b/bench/UpperPHY/Turbo/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Decoding/main.cpp b/bench/UpperPHY/Turbo/Decoding/main.cpp similarity index 94% rename from bench/Turbo/Decoding/main.cpp rename to bench/UpperPHY/Turbo/Decoding/main.cpp index b0e21bbd7bf5e2caff2d2e5f87cea24176a4ca14..1362f23d082e521cbaa79e3fa8894b1f4d6ce86a 100644 --- a/bench/Turbo/Decoding/main.cpp +++ b/bench/UpperPHY/Turbo/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" @@ -42,13 +42,13 @@ void run_turbo_decoding_perf(const uint32_t num_prbs, const uint32_t num_bits, 
buffer_bump_allocator allocator{buffer.data()}; armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), - itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.0, + itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, num_iters, allocator); #else heap_allocator allocator{}; armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), - itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.0, + itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, num_iters, allocator); #endif } diff --git a/bench/Turbo/Encoding/bench.py b/bench/UpperPHY/Turbo/Encoding/bench.py similarity index 84% rename from bench/Turbo/Encoding/bench.py rename to bench/UpperPHY/Turbo/Encoding/bench.py index 5c1db10cd9b1482bd060a0d64f6517ef16fb379b..a50972fe57b95ac08163f2ca58064ad72a5f0632 100755 --- a/bench/Turbo/Encoding/bench.py +++ b/bench/UpperPHY/Turbo/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Encoding/main.cpp b/bench/UpperPHY/Turbo/Encoding/main.cpp similarity index 95% rename from bench/Turbo/Encoding/main.cpp rename to bench/UpperPHY/Turbo/Encoding/main.cpp index b79df85cb7918faed9f697dbf863fb575adcf112..a4d39796a695e4e4da277f5b60be1ad3b207ab8b 100644 --- a/bench/Turbo/Encoding/main.cpp +++ b/bench/UpperPHY/Turbo/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateMatching/bench.py b/bench/UpperPHY/Turbo/RateMatching/bench.py similarity index 86% rename from bench/Turbo/RateMatching/bench.py rename to bench/UpperPHY/Turbo/RateMatching/bench.py index a36a1eab059a2a1c2d028c9e5fd303fecea3a439..9ba9ee120131a1757f32ba417afea6b7b2db95bc 100755 --- a/bench/Turbo/RateMatching/bench.py +++ b/bench/UpperPHY/Turbo/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateMatching/main.cpp b/bench/UpperPHY/Turbo/RateMatching/main.cpp similarity index 95% rename from bench/Turbo/RateMatching/main.cpp rename to bench/UpperPHY/Turbo/RateMatching/main.cpp index 809bf148a0be7ef96bbb09fa655d42d45583b821..3148fa28c57b267c46f16403eac9dc4f45bfa1da 100644 --- a/bench/Turbo/RateMatching/main.cpp +++ b/bench/UpperPHY/Turbo/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateRecovery/bench.py b/bench/UpperPHY/Turbo/RateRecovery/bench.py similarity index 86% rename from bench/Turbo/RateRecovery/bench.py rename to bench/UpperPHY/Turbo/RateRecovery/bench.py index 2cc54c28695c2e54b8923632b4bb60098aea21ea..3e74ded5ae35a8b9f703d7a97bc6028f4a8e499e 100755 --- a/bench/Turbo/RateRecovery/bench.py +++ b/bench/UpperPHY/Turbo/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its 
affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateRecovery/main.cpp b/bench/UpperPHY/Turbo/RateRecovery/main.cpp similarity index 95% rename from bench/Turbo/RateRecovery/main.cpp rename to bench/UpperPHY/Turbo/RateRecovery/main.cpp index 61d0e780cda38dd51d6c56996f8bdb68670025d0..38795a31f4072a0022288fb5a17b9be5eff800fc 100644 --- a/bench/Turbo/RateRecovery/main.cpp +++ b/bench/UpperPHY/Turbo/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/benchmarker.py b/bench/benchmarker.py index d3c2d6acc139f78bfa7ae641c8e83bcd8b7a9574..5a42cd30e2160ced1bdb71f371aaec613680b14d 100755 --- a/bench/benchmarker.py +++ b/bench/benchmarker.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # This program is for benchmarking the performance of armral functions. @@ -168,7 +168,7 @@ def ignore_sigint(): def run_benchmarks_concurrent(cases): num_procs = multiprocessing.cpu_count() with multiprocessing.Pool(processes=num_procs, initializer=ignore_sigint, maxtasksperchild=1) as pool: - # serialise display_result rather than using imap to avoid racing prints. + # serialize display_result rather than using imap to avoid racing prints. return max(map(display_result, pool.imap(run_case, cases))) diff --git a/bench/benchmarker_utils.py b/bench/benchmarker_utils.py index c369eb20a1c3746182e7059bcc71192070e2b079..370395d98985da5bdfdc9004c075436d35c374a3 100755 --- a/bench/benchmarker_utils.py +++ b/bench/benchmarker_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import collections import os import subprocess @@ -12,13 +12,15 @@ NETFUL_EXPECTED_TIMEOUT_RETCODE = 3 NETFUL_ALLOW_ERROR_RETCODE = 4 -def shell(cmd, check=True): +def shell(cmd, check=True, **kwargs): """ Run cmd on the command line and return stdout. Throws an exception - if the return code is non-zero. + if the return code is non-zero. Remaining kwargs are passed on to + subprocess.run(). 
""" result = subprocess.run(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, check=check) + stderr=subprocess.PIPE, check=check, + **kwargs) return ShellResult(result.returncode, result.stdout.decode("utf-8"), result.stderr.decode("utf-8")) diff --git a/bench/default_runner.py b/bench/default_runner.py index e5cb3ca87938f3cdf83d73c41eb5e784ba08483c..ee02254ce0d9a2d660e4aae892cdc8630584ad51 100755 --- a/bench/default_runner.py +++ b/bench/default_runner.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import argparse diff --git a/docs/doxywrapper/arm_infra_html.css b/docs/doxywrapper/arm_infra_html.css index f21550b86a8372d70e67cc0e65c73223d7d4b946..1a83e1d86292d65f531f9b990b936acf81d030a5 100644 --- a/docs/doxywrapper/arm_infra_html.css +++ b/docs/doxywrapper/arm_infra_html.css @@ -1333,7 +1333,7 @@ tr.heading h2 { } #powerTip div.ttdoc { - color: grey; + color: gray; font-style: italic; } diff --git a/docs/examples.md b/docs/examples.md index ebfeff5855d9ce515557f04f1eb9a89a2af86705..c789c0d88bbbcce9b3a4ac2655abbbcb55c7a2a0 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -3,7 +3,7 @@ This topic describes how to compile and link your application code to Arm RAN Acceleration Library (ArmRAL). -# Before you begin +## Before you begin * Ensure you have a recent version of a C/C++ compiler, such as GCC. See the Release Notes for a full list of supported GCC versions. @@ -17,9 +17,9 @@ Acceleration Library (ArmRAL). To build the library, use: - tar zxvf ral-armral-24.01.tar.gz - mkdir ral-armral-24.01/build - cd ral-armral-24.01/build + git clone -b armral-24.04 https://git.gitlab.arm.com/networking/ral.git + mkdir ral/build + cd ral/build cmake .. make -j @@ -28,7 +28,7 @@ Acceleration Library (ArmRAL). #include "armral.h" -# Procedure +## Procedure 1. Build and link your program with Arm RAN Acceleration Library. For GCC, use: @@ -49,7 +49,7 @@ Acceleration Library (ArmRAL). ./ -# Example: Run 'fft_cf32_example.c' +## Example: Run 'fft_cf32_example.c' In this example, we use Arm RAN Acceleration Library to compute and solve a simple Fast Fourier Transform (FFT) problem. @@ -103,8 +103,7 @@ The following source file can be found in the ArmRAL source directory under (-3.312299 + 1.687701i) (-5.940955 + -0.940955i) - -# Other examples: block-float, modulation, and polar examples +## Other examples: block-float, modulation, and polar examples Arm RAN Acceleration Library also includes block-float, modulation, and polar examples. These example files can also be found in the `/examples/` directory. @@ -157,11 +156,11 @@ included: `polar_example`, with an input array of `N = 128`, `E = 100`, and `K = 35`, use: - ./modulation_example 128 100 35 + ./polar_example 128 100 35 Each example can be run according to the **Procedure** described above, as demonstrated in the **Example: Run 'fft_cf32_example.c'** section. -# Related information +## Related information For more information, see the **README** file. diff --git a/docs/frontmatter.md b/docs/frontmatter.md index c5fd9bfe45a9a286bc0d31e9619da2c05b1fd5fc..fe559072e2ce5c37b9f23204cf8c0135793c5dab 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -2,17 +2,27 @@ Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. 
-# About this book +## About this book This book contains reference documentation for Arm RAN Acceleration Library (ArmRAL). The book was generated from the source code using Doxygen. -Arm RAN Acceleration Library contains a set of functions for accelerating -telecommunications applications such as, but not limited to, 5G Radio Access -Networks (RANs). +Arm RAN Acceleration Library provides optimized signal processing and related +maths functions for enabling 5G Radio Access Network (RAN) deployments. It +leverages the efficient vector units available on Arm cores that support the +Armv8-A architecture to accelerate 5G NR and LTE signal processing workloads, +including: + +* Matrix and vector arithmetic, such as matrix multiplication. +* Fast Fourier Transforms (FFTs). +* Digital modulation and demodulation. +* Cyclic Redundancy Check (CRC). +* Encoding and decoding schemes, including Polar, Low-Density Parity + Check (LDPC), and Turbo. +* Compression and decompression. You can download Arm RAN Acceleration Library for free from -https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download. +<https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download>. Arm RAN Acceleration Library is built as a static library, and must be linked in to any executable that needs to use the library. The source code can be @@ -22,9 +32,9 @@ in the `src` directory, testing code is located in the `test` directory, benchmarking code is located in the `bench` directory, and examples are located in the `examples` directory. -# Feedback +## Feedback -## Feedback on this product +### Feedback on this product If you have any comments or suggestions about this product, contact your supplier and give: @@ -34,18 +44,18 @@ supplier and give: * An explanation with as much information as you can provide. Include symptoms and diagnostic procedures if appropriate. -## Feedback on content +### Feedback on content If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2401_00_en. +* The number 102249_2404_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. Arm also welcomes general suggestions for additions and improvements. -# Non-Confidential Proprietary Notice +## Non-Confidential Proprietary Notice This document is protected by copyright and other related rights and the practice or implementation of the information contained in this document may be @@ -93,7 +103,7 @@ The Arm corporate logo and words marked with ® or ™ are registered trademarks trademarks of Arm Limited (or its affiliates) in the US and/or elsewhere. All rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm's trademark usage -guidelines at https://www.arm.com/company/policies/trademarks. +guidelines at <https://www.arm.com/company/policies/trademarks>. Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -103,7 +113,7 @@ Arm Limited. Company 02557590 registered in England. (LES-PRE-20349) -# Confidentiality Status +## Confidentiality Status This document is Non-Confidential. The right to use, copy and disclose this document may be subject to license restrictions in accordance with the terms of @@ -112,15 +122,15 @@ to. Unrestricted Access is an Arm internal classification. -# Product Status +## Product Status The information in this document is Final, that is for a developed product.
-# Web Address +## Web Address -https://developer.arm.com +<https://developer.arm.com> -# Progressive terminology commitment +## Progressive terminology commitment Arm values inclusive communities. Arm recognizes that we and our industry have used language that can be offensive. Arm strives to lead the industry and create @@ -129,9 +139,9 @@ change. We believe that this document contains no offensive terms. If you find offensive terms in this document, please contact terms@arm.com. -# Release Information +## Release Information -## Document History +### Document History Issue | Date | Confidentiality | Change --------|-----------------|------------------|----------------------------------------------------- @@ -149,3 +159,4 @@ Issue | Date | Confidentiality | Change 2307-00 | 07 July 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.07 2310-00 | 06 October 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.10 2401-00 | 19 January 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.01 +2404-00 | 19 April 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.04 diff --git a/examples/block_float_9b_example.c b/examples/block_float_9b_example.c index 8abe390708dbefbfdd954b074a402d6dc44f552f..1a48eb930537e160ceb62009db2f6cdd13b93d97 100644 --- a/examples/block_float_9b_example.c +++ b/examples/block_float_9b_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/fft_cf32_example.c b/examples/fft_cf32_example.c index 690d876d3b9f0d17d56371c754e9098c7ba9a87d..4fc4762dbca8012a487ed5b0bd93056b63199c92 100644 --- a/examples/fft_cf32_example.c +++ b/examples/fft_cf32_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -17,7 +17,7 @@ static void example_fft_plan_and_execute(int n) { armral_fft_create_plan_cf32(&p, n, -1); // Create the data that is to be used in FFTs. The input array (x) needs to - // be initialised. The output array (y) does not.
armral_cmplx_f32_t *x = (armral_cmplx_f32_t *)malloc(n * sizeof(armral_cmplx_f32_t)); armral_cmplx_f32_t *y = diff --git a/examples/modulation_example.c b/examples/modulation_example.c index 3ee95d6ef229a402573e9ae7f1d81da027c3392d..94538ac816aa23de7124c876feaa3840f3bd42ed 100644 --- a/examples/modulation_example.c +++ b/examples/modulation_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/polar_example.cpp b/examples/polar_example.cpp index 5648f308dc9990b73d03872b07e4f9e44aaa6e9d..d2b9f814c9726b5917c6f7bd401ea459d31b487c 100644 --- a/examples/polar_example.cpp +++ b/examples/polar_example.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/include/armral.h b/include/armral.h index 2f247eb1f08ef444ec9b687c98200e1a298800f1..60592f02ce993de2ff59b50cf60dabb79aadbf27 100644 --- a/include/armral.h +++ b/include/armral.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -193,8 +193,8 @@ typedef struct { * \brief 32-bit floating-point complex data type. */ typedef struct { - float re; ///< 32-bit real component. - float im; ///< 32-bit imaginary component. + float32_t re; ///< 32-bit real component. + float32_t im; ///< 32-bit imaginary component. } armral_cmplx_f32_t; /** @@ -418,10 +418,11 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, const armral_cmplx_f32_t *a, * @param[out] c_im Points to the imaginary part of the output result. * @return An `armral_status` value that indicates success or failure. */ -armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *a_re, - const float *a_im, const float *b_re, - const float *b_im, float *c_re, - float *c_im); +armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float32_t *a_re, + const float32_t *a_im, + const float32_t *b_re, + const float32_t *b_im, float32_t *c_re, + float32_t *c_im); /** @} end of cmplx_by_cmplx_mult group */ @@ -489,11 +490,12 @@ armral_status armral_cmplx_vecdot_f32(int32_t n, * @param[out] p_src_c_im Points to the imaginary part of the output result. * @return An `armral_status` value that indicates success or failure. */ -armral_status armral_cmplx_vecdot_f32_2(int32_t n, const float *p_src_a_re, - const float *p_src_a_im, - const float *p_src_b_re, - const float *p_src_b_im, - float *p_src_c_re, float *p_src_c_im); +armral_status armral_cmplx_vecdot_f32_2(int32_t n, const float32_t *p_src_a_re, + const float32_t *p_src_a_im, + const float32_t *p_src_b_re, + const float32_t *p_src_b_im, + float32_t *p_src_c_re, + float32_t *p_src_c_im); /** * This algorithm computes the dot product between a pair of arrays of complex @@ -851,7 +853,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_i16_32bit_pa( /** * This algorithm performs the multiplication `A x` for matrix `A` and vector * `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. 
* * @param[in] m The number of rows in matrix `A` and the length of @@ -872,7 +874,7 @@ armral_status armral_cmplx_mat_vec_mult_f32(uint16_t m, uint16_t n, * This algorithm performs matrix-vector multiplication for a batch of * `M-by-N` matrices and length `N` input vectors. Each multiplication is of the * form `A x` for a matrix `A` and vector `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. * * The matrix elements are interleaved such that all elements for a particular @@ -915,7 +917,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( * `M-by-N` matrices and length `N` input vectors, utilizing a "pointer array" * storage layout for the input and output matrix batches. Each multiplication * is of the form `A x` for a matrix `A` and vector `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. * * The `p_srcs_a` parameter is an array of pointers of length `M * N`. The @@ -1651,9 +1653,9 @@ armral_cmplx_mat_inverse_batch_f32_pa(uint32_t num_mats, uint32_t size, * in memory, in row-major order. * * \note - * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * - If `m <= n` the number of rows `m` in the input matrix must be 1, 2, 3, 4, * 8 or 16. - * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * - If `m > n` the number of columns `n` in the input matrix must be 1, 2, 3, * 4, 8 or 16. * * @param[in] m The number of rows in input matrix `A`. @@ -1698,9 +1700,9 @@ armral_cmplx_pseudo_inverse_direct_f32(uint16_t m, uint16_t n, float32_t lambda, * in memory, in row-major order. * * \note - * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * - If `m <= n` the number of rows `m` in the input matrix must be 1, 2, 3, 4, * 8 or 16. - * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * - If `m > n` the number of columns `n` in the input matrix must be 1, 2, 3, * 4, 8 or 16. * * This function takes a pre-allocated buffer (`buffer`) to use internally. @@ -4329,7 +4331,7 @@ uint32_t armral_tail_biting_convolutional_decode_block_noalloc_buffer_size( * @return An `armral_status` value that indicates success or failure. */ armral_status armral_svd_cf32(bool vect, int m, int n, armral_cmplx_f32_t *a, - float *s, armral_cmplx_f32_t *u, + float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt); /** @@ -4387,7 +4389,7 @@ armral_status armral_svd_cf32(bool vect, int m, int n, armral_cmplx_f32_t *a, * @return An `armral_status` value that indicates success or failure. 
*/ armral_status armral_svd_cf32_noalloc(bool vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, void *buffer); diff --git a/python/benchmark_excel_summary.py b/python/benchmark_excel_summary.py new file mode 100755 index 0000000000000000000000000000000000000000..8dd3fe8fa4e13d018af416fa9e0fd92d9db6cc25 --- /dev/null +++ b/python/benchmark_excel_summary.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + +import argparse +import json +import pandas as pd +import re +import sys + + +class UnpackedName: + """ + A helper class for giving a nice sort order over benchmark names. + This essentially just splits the string on underscores and dots and + converts what it can to integers for comparison, such that + e.g. decompression_9b_2 < decompression_9b_10 + """ + + def __init__(self, name): + self.name = name + self.arr = re.findall(r"[^_.]+", name) + for i, elem in enumerate(self.arr): + nums = list(map(int, re.findall(r"[0-9]+", elem))) + self.arr[i] = (nums, elem) + + def __lt__(self, other): + """ + Lexicographic comparison, with type mismatches handled arbitrarily + """ + for x, y in zip(self.arr, other.arr): + assert type(x) is type(y) + if x < y: + return True + if x > y: + return False + return len(self.arr) < len(other.arr) + + def __eq__(self, other): + return self.name == other.name + + def __hash__(self): + return self.name.__hash__() + + def __len__(self): + return len(self.name) + + def __str__(self): + return self.name + + +def format_headers_set_widths(worksheet, workbook, df): + header_format = workbook.add_format({ + 'bold': True, + 'align': 'center', + 'border': 1}) + + # Write headers + for col_num, header in enumerate(df.columns.values): + worksheet.write(0, col_num, header, header_format) + + # Set width for benchmark name column + worksheet.set_column(0, 0, max(len(x) for x in df.name)) + + # Set width for data columns + max_width = max(len(x) for x in df.columns.values) + worksheet.set_column(1, 1, max_width + 2) + + +def write_worksheet(writer, workbook, df): + name = "results" + + # Write the table to an Excel worksheet, leaving room to add headers + df.to_excel( + writer, sheet_name=name, startrow=1, + float_format="%.2f", header=False, index=False) + + # Write headers and set column widths + worksheet = writer.sheets[name] + format_headers_set_widths(worksheet, workbook, df) + + +def sort_table(df): + # Sort by the benchmark names + df = df.set_index(df.name) + new_ind = sorted(df.index.values, key=lambda x: UnpackedName(x)) + return df.reindex(new_ind) + + +def get_json_results(src): + json_data = [] + with open(src, 'rb') as f: + for line in f: + json_data.append(json.loads(line)) + + # Create dataframe of results + df = pd.json_normalize(json_data) + + # Drop columns that won't be used + drop_cols = df.columns.difference(['name', 'median_cycles']) + return df.drop(drop_cols, axis=1) + + +def write_workbook(src, dst): + # Get dataframes for data and sort by benchmark name + df_results = get_json_results(src) + df_results = sort_table(df_results) + + # Create a workbook and add worksheets + writer = pd.ExcelWriter(dst, engine="xlsxwriter") + workbook = writer.book + write_worksheet(writer, workbook, df_results) + + # Write the file + writer.close() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output", metavar="path", + default="results.xlsx",
help="specify output workbook path") + parser.add_argument("json_file", help="JSON benchmark results file") + args = parser.parse_args() + + print("Writing Excel workbook to {}".format(args.output)) + write_workbook(args.json_file, args.output) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6907a90775336029cdfc46fd4831e78a0d99aca5 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,2 @@ +pandas +xlsxwriter diff --git a/simulation/CMakeLists.txt b/simulation/CMakeLists.txt index cf7aa6241409d2e2c3a83f8cd0372d16e00fc2df..d0c5438f3dd9601f0a7f7e773e1831d349922dcb 100644 --- a/simulation/CMakeLists.txt +++ b/simulation/CMakeLists.txt @@ -10,67 +10,118 @@ add_custom_target(simulation) # Interface for common simulation includes add_library(simulation_common INTERFACE) -target_include_directories(simulation_common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_include_directories(simulation_common + INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) function(set_omp_cxx_flags) - if (NOT OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") - return() - endif() - check_c_compiler_flag(-fopenmp OPENMP_FLAG_IS_VALID) - if (OPENMP_FLAG_IS_VALID) - set(OpenMP_CXX_FLAGS "-fopenmp" PARENT_SCOPE) - else() - check_c_compiler_flag(-fopenmp=libomp OPENMP_FLAG_IS_VALID) - if (OPENMP_FLAG_IS_VALID) - set(OpenMP_CXX_FLAGS "-fopenmp=libomp" PARENT_SCOPE) - endif() + if(NOT OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") + return() + endif() + check_c_compiler_flag(-fopenmp OPENMP_FLAG_IS_VALID) + if(OPENMP_FLAG_IS_VALID) + set(OpenMP_CXX_FLAGS + "-fopenmp" + PARENT_SCOPE) + else() + check_c_compiler_flag(-fopenmp=libomp OPENMP_FLAG_IS_VALID) + if(OPENMP_FLAG_IS_VALID) + set(OpenMP_CXX_FLAGS + "-fopenmp=libomp" + PARENT_SCOPE) endif() + endif() endfunction() find_package(OpenMP) find_package(Threads) -if (Threads_FOUND) - # This is not the modern way of adding support for OpenMP - # but rather a fix for known issues when using CMake < 3.4 - if(NOT TARGET OpenMP::OpenMP_CXX) - add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) - - set_omp_cxx_flags() - if (OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") - # Sometimes we are failing to find OpenMP in testing. Needs more investigation, but in - # the meantime, just don't build the project - add_custom_target(ldpc_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(polar_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(turbo_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(convolutional_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(modulation_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - return() - endif() - - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS}) - # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise. - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads) +if(Threads_FOUND) + # This is not the modern way of adding support for OpenMP but rather a fix for + # known issues when using CMake < 3.4 + if(NOT TARGET OpenMP::OpenMP_CXX) + add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) + set_omp_cxx_flags() + if(OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") + # Sometimes we are failing to find OpenMP in testing. 
Needs more + # investigation, but in the meantime, just don't build the project + add_custom_target( + ldpc_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + polar_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + turbo_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + convolutional_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + modulation_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + return() endif() - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS}) - # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise. - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads) + set_property(TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_COMPILE_OPTIONS + ${OpenMP_CXX_FLAGS}) + # Only works if the same flag is passed to the linker; use CMake 3.9+ + # otherwise. + set_property( + TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_LINK_LIBRARIES + ${OpenMP_CXX_FLAGS} Threads::Threads) endif() - # Actual simulation code - add_subdirectory(ldpc_awgn) - add_subdirectory(polar_awgn) - add_subdirectory(turbo_awgn) - add_subdirectory(convolutional_awgn) - add_subdirectory(modulation_awgn) -else() + # Actual simulation code + function(add_armral_sim SIM_NAME SIM_CMD_LINE_OPTS) + set(SIM_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${SIM_NAME}/${SIM_NAME}.cpp) + + set(SIM_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) + set(SIM_COMPILER_FLAGS + $<$<COMPILE_LANGUAGE:CXX>:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17 + ${OpenMP_CXX_FLAGS}>) + + add_executable(${SIM_NAME} ${SIM_SOURCE}) + target_link_libraries(${SIM_NAME} PUBLIC simulation_common armral + armral_awgn armral_utils) + target_link_libraries(${SIM_NAME} PRIVATE OpenMP::OpenMP_CXX) + target_compile_options( + ${SIM_NAME} PRIVATE ${SIM_COMPILE_OPTIONS} ${SIM_COMPILER_FLAGS} + "$<$<CONFIG:Debug>:-Og>") - # If no Threads is found we simply do not create the ldpc_awgn targets. - # This is useful when building on bare-metal where the concept of threads - # does not exist but we still want to build the library. - MESSAGE(STATUS "Threads not found, skip ldpc_awgn target.") + add_dependencies(simulation ${SIM_NAME}) + if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) + # Add test for the simulation executable. At present this just checks that + # the executable can be successfully invoked with a set of valid inputs. + # We do not check the validity of the output. We also only run this if we + # are not using a test running wrapper. + add_test(NAME ${SIM_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} + ${SIM_CMD_LINE_OPTS}) + set_tests_properties(${SIM_NAME} PROPERTIES TIMEOUT 3000) + add_dependencies(check ${SIM_NAME}) + endif() + + endfunction() + + add_armral_sim(convolutional_awgn "-k;8;-m;0;-u;128") + add_armral_sim(ldpc_awgn "-z;3;-b;1;-m;0;-r;0;-u;128") + add_armral_sim(modulation_awgn "-k;32;-m;0;-u;128") + add_armral_sim(polar_awgn "-k;32;-e;32;-l;1;-m;0;-i;0;-u;128") + add_armral_sim(turbo_awgn "-k;40;-m;0;-i;1;-e;60") + +else() + # If no Threads is found we simply do not create the simulation targets. This + # is useful when building on bare-metal where the concept of threads does not + # exist but we still want to build the library.
+ message(STATUS "Threads not found, skip simulation target.") endif() if(BUILD_TESTING) - # Build simulation as part of the main "check" target - add_dependencies(check simulation) + # Build simulation as part of the main "check" target + add_dependencies(check simulation) endif() diff --git a/simulation/README.md b/simulation/README.md index 77829a405eadd841eed22520b8d15f494b21df56..8453b2a5f5a706b6950241aba77a465c1e66ff57 100644 --- a/simulation/README.md +++ b/simulation/README.md @@ -1,7 +1,6 @@ -Get started with ArmRAL noisy channel simulation -================================================ +# Get started with ArmRAL noisy channel simulation -# Introduction +## Introduction This directory contains utilities and programs that you can use to evaluate the error-correction performance of the coding schemes provided in Arm RAN @@ -16,9 +15,9 @@ find a mathematical description of the AWGN which is simulated. The definition of what is meant by bit and block error rates is then given, and we conclude with instructions for how to use the utilities contained in this folder. -# Additive White Gaussian Noise (AWGN) Simulation +## Additive White Gaussian Noise (AWGN) Simulation -## Using simulated noise +### Using simulated noise Noisy channels are simulated by adding noise to the symbols generated by the modulation routine. This simulates that a signal is sent over a noisy network. @@ -60,7 +59,7 @@ for spectral efficiency `rho`. To calculate the spectral efficiency, the modulation scheme and bandwidth of the channel must be known, and passed to the simulation program. -## Further assumptions +### Further assumptions The simulation programs follow the description of coding and modulation schemes provided in 3GPP Technical Specification (TS) 36.212, Section 5.1.3 (for Turbo @@ -75,7 +74,7 @@ and Polar coding). We make the following further assumptions: 3. No Cyclic Redundancy Check (CRC) is performed. -## Bit and block error rates +### Bit and block error rates The simulator computes the error rates in terms of bits or blocks by comparing the input bits of encoding and the output decoded bits. The input bits are @@ -93,7 +92,7 @@ block with at least one incorrectly decoded bit. bler = nbl / number_of_blocks -# Get started with simulation programs +## Get started with simulation programs **Note:** To compile and execute the simulation programs, you must have built ArmRAL with the `-DBUILD_SIMULATION=On` CMake option. @@ -107,8 +106,9 @@ The following assumes that you are running commands from the build directory. The built binaries can be found in the `simulation` subdirectory of the build directory. -In the following, the coding scheme `<scheme>` must be one of `polar`, `turbo`, -`ldpc`, or `modulation` for simulations without using a coding scheme. +In the following, `<scheme>` can be one of the supported coding schemes +(`convolutional`, `ldpc`, `polar` or `turbo`). Set `<scheme>` to `modulation` for +simulation without a coding scheme. * To build the AWGN channel simulation for a given coding scheme `<scheme>`, use: @@ -116,12 +116,12 @@ In the following, the coding scheme `<scheme>` must be one of `polar`, `turbo`, * To run the AWGN channel simulation for `<scheme>` with arguments `<args>`, use: - ./simulation/<scheme>_awgn/<scheme>_awgn <args> + ./simulation/<scheme>_awgn <args> * To get a list of possible input arguments and associated documentation, use the same command without arguments: - ./simulation/<scheme>_awgn/<scheme>_awgn + ./simulation/<scheme>_awgn * Executing a simulation will write JSON output to stdout.
The output contains information on the observed bit and block error rates for the input @@ -129,7 +129,7 @@ In the following, the coding scheme `<scheme>` must be one of `polar`, `turbo`, use of the Python scripts described in the section on drawing performance charts. -# Modulation schemes +## Modulation schemes All simulators use modulation and demodulation, respectively, before and after adding noise to the channel. @@ -153,7 +153,7 @@ the range of the generated log-likelihood ratios (LLRs). A default value for find that the best performance of decoding relies on a good choice of `<ulp>`, and you are encouraged to provide a value for this parameter. -# Simulation program for modulation +## Simulation program for modulation The program `modulation_awgn` simulates the transmission of data without performing any forward error correction. Data is modulated, then has @@ -179,7 +179,7 @@ The JSON record contains the following fields: "ber": } -# Simulation programs for individual coding schemes +## Simulation programs for individual coding schemes In this section, we give the definition of some parameters used in the programs associated with each coding scheme. @@ -189,12 +189,13 @@ help text use <scheme>_awgn --help -where `<scheme>_awgn` is one of `polar_awgn`, `turbo_awgn`, or `ldpc_awgn`. The -help text of the programs gives more detailed descriptions on the parameters -than you will find in the sections below. The information below helps you to -run the simulation programs and understand their output. +where `<scheme>_awgn` is one of `polar_awgn`, `turbo_awgn`, `ldpc_awgn`, or +`convolutional_awgn`. The help text of the programs gives more detailed +descriptions on the parameters than you will find in the sections below. The +information below helps you to run the simulation programs and understand their +output. -## Polar Codes +### Polar Codes You can run the `polar` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -219,7 +220,7 @@ The JSON record contains the following fields: "ber": } -## Turbo Codes +### Turbo Codes You can run the `turbo` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -241,7 +242,7 @@ The JSON record contains the following fields: "ber": } -## Low-Density Parity-Check (LDPC) Codes +### Low-Density Parity-Check (LDPC) Codes You can run the `LDPC` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -265,7 +266,7 @@ The JSON record contains the following fields: "ber": } -## Tail-biting Convolutional Codes +### Tail-biting Convolutional Codes You can run the `convolutional` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -286,7 +287,7 @@ The JSON record contains the following fields: "ber": } -# Drawing performance charts +## Drawing performance charts The simulator allows users to evaluate the performance of a coding scheme. In the context of noisy channels, performance is evaluated in terms of output @@ -318,7 +319,7 @@ scripts requires a recent version of Python. ArmRAL has been tested with Python ./simulation/<scheme>_awgn/<scheme>_error_rate.py --help -# Drawing capacity charts +## Drawing capacity charts The simulator allows users to draw the data rates of each modulation and compare them to the capacity of the AWGN channel (the Shannon limit).
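The Eb/N0 to SNR conversion that this README leans on is implemented by the `ebn0_to_snr` helper in `simulation/awgn/awgn.cpp`, which appears in the diffs below. As a minimal sketch, assuming the standard relation `snr_db = ebn0_db + 10 * log10(rho)` for spectral efficiency `rho = coding_rate * bits_per_symbol * symbol_rate / bandwidth` (the Python function and argument names here are illustrative only, not part of the library):

    from math import log10

    def ebn0_to_snr_db(ebn0_db, coding_rate, bits_per_symbol,
                       symbol_rate, bandwidth):
        # Spectral efficiency: information bits per second per Hz of bandwidth.
        rho = coding_rate * bits_per_symbol * symbol_rate / bandwidth
        # Both quantities are in dB, so the conversion is a single offset.
        return ebn0_db + 10.0 * log10(rho)

    # Example: a rate-1/3 code with QPSK (2 bits per symbol) and, as the
    # simulation programs assume, a symbol rate equal to the bandwidth:
    print(ebn0_to_snr_db(4.0, 1.0 / 3.0, 2, 1.0, 1.0))  # ~2.24 dB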
diff --git a/simulation/awgn/CMakeLists.txt b/simulation/awgn/CMakeLists.txt index be9a7cac443c191a818886e7c3361e8d3e8f906a..b5f552237f6ee944e898690552ce22bbdd52ec24 100644 --- a/simulation/awgn/CMakeLists.txt +++ b/simulation/awgn/CMakeLists.txt @@ -1,19 +1,15 @@ cmake_minimum_required(VERSION 3.3) project(awgn VERSION 0.0) -set(AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/awgn.cpp -) +set(AWGN_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/awgn.cpp) set(AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(AWGN_COMPILER_FLAGS - $<$<COMPILE_LANGUAGE:CXX>:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17>) +set(AWGN_COMPILER_FLAGS $<$<COMPILE_LANGUAGE:CXX>:-Wshadow -Wall -Wcast-qual + -fno-rtti -fno-exceptions -std=c++17>) add_library(armral_awgn ${AWGN_SOURCES}) target_include_directories(armral_awgn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(armral_awgn PUBLIC armral armral_utils) -target_compile_options(armral_awgn PRIVATE - ${AWGN_COMPILE_OPTIONS} - ${AWGN_COMPILER_FLAGS} - "$<$<CONFIG:Debug>:-Og>" -) +target_compile_options( + armral_awgn PRIVATE ${AWGN_COMPILE_OPTIONS} ${AWGN_COMPILER_FLAGS} + "$<$<CONFIG:Debug>:-Og>") diff --git a/simulation/awgn/awgn.cpp b/simulation/awgn/awgn.cpp index 6ae035c19b73e4ad16a91ec481d6a787b9ee55db..cfd76520693009934287b74c2c9090a2423b33d7 100644 --- a/simulation/awgn/awgn.cpp +++ b/simulation/awgn/awgn.cpp @@ -1,18 +1,20 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ -#include "awgn.h" +#include "awgn.hpp" #include "rng.hpp" #include #include +namespace armral::simulation { + /* - * Return a random doubleing-point number in the range [-1, 1). + * Return a random double floating-point number in the range [-1, 1). */ -static double sample_double_unit(random_state *state) { - linear_congruential_generator lcg; +static double sample_double_unit(armral::utils::random_state *state) { + armral::utils::linear_congruential_generator lcg; return lcg.one(state) * 2.0 - 1.0; } @@ -20,7 +22,7 @@ static double sample_double_unit(random_state *state) { * Return a number taken from a normal distribution with mean=0 and * the specified stddev. */ -static double sample_normal(random_state *state, double sigma) { +static double sample_normal(armral::utils::random_state *state, double sigma) { double u; double r; do { @@ -38,8 +40,9 @@ static double sample_normal(random_state *state, double sigma) { * decibels, and deduce the random field required to produce noise of the * appropriate power (mean square amplitude). This noise is added to the signal.
*/ -void add_awgn(random_state *state, int num_mod_symbols, double snr_db, - armral_fixed_point_index frac_bits, armral_cmplx_int16_t *xs) { +void add_awgn(armral::utils::random_state *state, int num_mod_symbols, + double snr_db, armral_fixed_point_index frac_bits, + armral_cmplx_int16_t *xs) { // snr_db = 10 * log_10(s / r) // => r = 10^(-snr/10) @@ -92,3 +95,5 @@ double ebn0_to_snr(double coding_rate, int bits_per_symb, double symb_rate, return snr_db; } + +} // namespace armral::simulation diff --git a/simulation/awgn/awgn.h b/simulation/awgn/awgn.hpp similarity index 82% rename from simulation/awgn/awgn.h rename to simulation/awgn/awgn.hpp index ad8c8fe397fda55260d459326c415bbeb521353f..ba978e80e4d2848148aff94701d4beb96dc87917 100644 --- a/simulation/awgn/awgn.h +++ b/simulation/awgn/awgn.hpp @@ -1,12 +1,14 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include "armral.h" #include "rng.hpp" +namespace armral::simulation { + /* * Add noise to a channel where it is assumed that the power (mean square * amplitude) of the signal is 1. We target a specific signal to noise ratio in @@ -21,8 +23,9 @@ * param [in,out] xs On input, the signal to add noise to. On * output the signal, disturbed by random noise. */ -void add_awgn(random_state *state, int num_mod_symbols, double snr_db, - armral_fixed_point_index frac_bits, armral_cmplx_int16_t *xs); +void add_awgn(armral::utils::random_state *state, int num_mod_symbols, + double snr_db, armral_fixed_point_index frac_bits, + armral_cmplx_int16_t *xs); /* * Compute the SNR in db given the coding rate, the bits per symbol, the @@ -37,3 +40,5 @@ void add_awgn(random_state *state, int num_mod_symbols, double snr_db, */ double ebn0_to_snr(double coding_rate, int bits_per_symb, double symb_rate, double bw, double ebn0_db); + +} // namespace armral::simulation diff --git a/simulation/capacity/capacity.py b/simulation/capacity/capacity.py index 4b331ad5394788908ee210582e3adac946e5687c..496028fa5871daf154d929075c55c2164ff7f808 100755 --- a/simulation/capacity/capacity.py +++ b/simulation/capacity/capacity.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from math import sqrt, exp, pi, log diff --git a/simulation/convolutional_awgn/CMakeLists.txt b/simulation/convolutional_awgn/CMakeLists.txt deleted file mode 100644 index 14faf79fac024d101910d71acf49da3bfd64c8d8..0000000000000000000000000000000000000000 --- a/simulation/convolutional_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(convolutional_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. 
Please invoke CMake from the simulation directory") -endif() - -set(CONV_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/convolutional_awgn.cpp -) - -set(CONV_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(CONV_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(convolutional_awgn ${CONV_AWGN_SOURCES}) -target_link_libraries(convolutional_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(convolutional_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(convolutional_awgn PRIVATE - ${CONV_AWGN_COMPILE_OPTIONS} - ${CONV_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation convolutional_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. - add_test(NAME convolutional_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/convolutional_awgn -k 8 -m 0 -u 128) - set_tests_properties(convolutional_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check convolutional_awgn) -endif() diff --git a/simulation/convolutional_awgn/convolutional_awgn.cpp b/simulation/convolutional_awgn/convolutional_awgn.cpp index 294b5b30d0de0cdf07412bb82498d90e4c920694..6c74c89c343cf6a0cbf9c8eb8f25752e35d8c9f1 100644 --- a/simulation/convolutional_awgn/convolutional_awgn.cpp +++ b/simulation/convolutional_awgn/convolutional_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -28,7 +28,7 @@ void usage(const char *exe_name) { << "The arguments required by " << exe_name << " are:\n\n" << " Number of bits in the encoded message.\n" << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(2) + << armral::simulation::print_valid_mod_type(2) << " Scaling parameter used in demodulation when\n" << " using fixed-point Q2.13 representation for symbols.\n" << " is an integer.\n" @@ -64,7 +64,7 @@ struct convolutional_example_data { convolutional_example_data(uint32_t k, armral_modulation_type mod) { mod_type = mod; - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); len_in = k; len_encoded = k; len_out = k; @@ -108,13 +108,13 @@ struct convolutional_example_data { // Perform an end-to-end encoding, modulation, transmission, demodulation, and // decoding and count the number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, uint32_t iter_max, convolutional_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -137,12 +137,12 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data2_encoded, data->data2_mod); // AWGN channel effects - add some noise to all the encoded bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data0_mod); - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data1_mod); - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data2_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data0_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data1_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data2_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -175,7 +175,7 @@ struct sim_result { sim_result(uint32_t k_in, armral_modulation_type mod, uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages, uint32_t iter_max_in) - : k(k_in), mod_type(armral_simulation::mod_to_str(mod)), ulp(ulp_in), + : k(k_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * k)), iter_max(iter_max_in) { } @@ -204,7 +204,7 @@ struct sim_result { bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, uint16_t ulp, double ebn0_db) { // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding ratio (k/n) of the LTE convolutional codes // is 1/3, see 3GPP TS 36.212 double coding_rate = 1.0 / 3.0; @@ -214,8 +214,8 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. 
double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -228,7 +228,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, convolutional_example_data data(k, mod_type); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, iter_max, &data); nb += num_bit_errors; @@ -253,7 +253,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; uint16_t ulp = 0; uint32_t iter_max = 0; @@ -292,10 +292,10 @@ int main(int argc, char **argv) { << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/convolutional_awgn/convolutional_error_rate.py b/simulation/convolutional_awgn/convolutional_error_rate.py index 1bd7e713311b1b1984b004ab2c48530acb1a2276..0a887e61b72c49b015fcaf6d433b7b79e082d1ce 100755 --- a/simulation/convolutional_awgn/convolutional_error_rate.py +++ b/simulation/convolutional_awgn/convolutional_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/include/simulation_common.hpp b/simulation/include/simulation_common.hpp index ecced32b5c346f671d64f9903f3f44fabb76ffbd..3a36b98a02cf1ce14acb89ade92162a784061b8c 100644 --- a/simulation/include/simulation_common.hpp +++ b/simulation/include/simulation_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -10,7 +10,7 @@ #define SNEW(T, sz) ((T *)calloc(sz, sizeof(T))) -namespace armral_simulation { +namespace armral::simulation { uint8_t bits_per_symbol(armral_modulation_type mod_type) { uint8_t nb_bits; @@ -72,4 +72,4 @@ std::string print_valid_mod_type(int num_tabs) { return os.str(); } -} // namespace armral_simulation +} // namespace armral::simulation diff --git a/simulation/include/simulation_common.py b/simulation/include/simulation_common.py index f062f40ea97b363385cd3bbc917c61796de4b509..1500149d6ff5ba41f8e4082326878cc7fda44912 100755 --- a/simulation/include/simulation_common.py +++ b/simulation/include/simulation_common.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from dataclasses import dataclass from datetime import datetime diff --git a/simulation/ldpc_awgn/CMakeLists.txt b/simulation/ldpc_awgn/CMakeLists.txt deleted file mode 100644 index 
bfba1437d22f00a32ec3b162b4c6a911d2f4fd34..0000000000000000000000000000000000000000 --- a/simulation/ldpc_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(ldpc_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(LDPC_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/ldpc_awgn.cpp -) - -set(LDPC_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(LDPC_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(ldpc_awgn ${LDPC_AWGN_SOURCES}) -target_link_libraries(ldpc_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(ldpc_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(ldpc_awgn PRIVATE - ${LDPC_AWGN_COMPILE_OPTIONS} - ${LDPC_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation ldpc_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. - add_test(NAME ldpc_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ldpc_awgn -z 3 -b 1 -m 0 -r 0 -u 128) - set_tests_properties(ldpc_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check ldpc_awgn) -endif() diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp index 9df77ae94b85c3fcae456d9e42afb8756da7eb9c..19db5f7c3e0cbd01e9fdb056760f43456ef3b308 100644 --- a/simulation/ldpc_awgn/ldpc_awgn.cpp +++ b/simulation/ldpc_awgn/ldpc_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -91,7 +91,7 @@ void usage(const char *exe_name) { << " values are:\n" << " " << print_valid_base_graph() << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " The redundancy version to be used. 
Supported\n" << " values are:\n" << " " << print_valid_redundancy_version() @@ -142,7 +142,7 @@ struct ldpc_example_data { data_in_bytes = SNEW(uint8_t, 2 * z); data_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); data_encoded_bytes = SNEW(uint8_t, len_encoded); - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); nref = 0; len_rate_matched = bit_per_symbol * ((len_encoded + bit_per_symbol - 1) / bit_per_symbol); @@ -169,14 +169,14 @@ struct ldpc_example_data { } }; -int run_check(random_state *state, uint32_t z, armral_ldpc_graph_t bg, - uint32_t rv, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, uint32_t z, + armral_ldpc_graph_t bg, uint32_t rv, double snr_db, uint32_t ulp, ldpc_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < (data->len_in - data->len_filler_bits); ++i) { - uint8_t bit = - static_cast<uint8_t>(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast<uint8_t>( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). Not sure if // that is an issue with randomly generated data, but we are paying @@ -204,8 +204,8 @@ int run_check(random_state *state, uint32_t z, armral_ldpc_graph_t bg, data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -255,9 +255,9 @@ struct sim_result { armral_modulation_type mod, uint32_t rv_in, double ebn0_in, double snr_in, uint16_t ulp_in, uint16_t filler_bits_len, uint32_t nb, uint32_t nm, uint32_t num_messages) - : n(n_in), bg((int)bg_in + 1), mod_type(armral_simulation::mod_to_str(mod)), - rv(rv_in), ebn0(ebn0_in), snr(snr_in), ulp(ulp_in), - len_filler_bits(filler_bits_len), + : n(n_in), bg((int)bg_in + 1), + mod_type(armral::simulation::mod_to_str(mod)), rv(rv_in), ebn0(ebn0_in), + snr(snr_in), ulp(ulp_in), len_filler_bits(filler_bits_len), bler(static_cast<double>(nm) / num_messages), ber(static_cast<double>(nb) / (num_messages * n_in)) {} @@ -290,7 +290,7 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, uint16_t len_filler_bits, double ebn0_db) { const auto *graph = armral_ldpc_get_base_graph(bg); // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding rate (k/n) for LDPC base graph 1 is 1/3 (k = 22 Z, n = 66 Z) // and 1/5 for LDPC base graph 2 (k = 10 Z, n = 50 Z), see 3GPP TS 38.212 double coding_rate; @@ -306,8 +306,8 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, // is equal to the number of bits per symbol. To meet this criterion we take // the symbol rate equal to the bandwidth. 
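// For orientation, a sketch of the conversion the call below presumably
// performs (assuming the textbook Eb/N0-to-SNR relation; the body of
// armral::simulation::ebn0_to_snr is not shown in this patch): with
// information bit rate Rb = coding_rate * bits_per_symb * symb_rate,
//   SNR_dB = (Eb/N0)_dB + 10 * log10(Rb / bw),
// which, with the symbol rate taken equal to the bandwidth, reduces to
//   SNR_dB = (Eb/N0)_dB + 10 * log10(coding_rate * bits_per_symb).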
double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -321,7 +321,7 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, ldpc_example_data data(z, mod_type, graph, len_filler_bits); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, z, bg, rv, snr_db, ulp, &data); nb += num_bit_errors; @@ -405,10 +405,10 @@ int main(int argc, char **argv) { << "\t" << print_valid_base_graph() << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/ldpc_awgn/ldpc_error_rate.py b/simulation/ldpc_awgn/ldpc_error_rate.py index 32e75e3c50a46595edc01cdd3ccc934193f1c9ab..0eb6643a35d12bf0601f9519f3860dd46223e3c7 100755 --- a/simulation/ldpc_awgn/ldpc_error_rate.py +++ b/simulation/ldpc_awgn/ldpc_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/modulation_awgn/CMakeLists.txt b/simulation/modulation_awgn/CMakeLists.txt deleted file mode 100644 index c30886adef14827a1b97aac820c7195ed0b345b1..0000000000000000000000000000000000000000 --- a/simulation/modulation_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(modulation_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(MODULATION_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/modulation_awgn.cpp -) - -set(MODULATION_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(MODULATION_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(modulation_awgn ${MODULATION_AWGN_SOURCES}) -target_link_libraries(modulation_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(modulation_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(modulation_awgn PRIVATE - ${MODULATION_AWGN_COMPILE_OPTIONS} - ${MODULATION_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation modulation_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. 
- add_test(NAME modulation_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/modulation_awgn -k 32 -m 0 -u 128) - set_tests_properties(modulation_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check modulation_awgn) -endif() diff --git a/simulation/modulation_awgn/modulation_awgn.cpp b/simulation/modulation_awgn/modulation_awgn.cpp index 426c90e10994ad86c1fedb61e3ce3ae08731df17..bd493e5bdbbec7a5ace907860c788acb1ad97ddb 100644 --- a/simulation/modulation_awgn/modulation_awgn.cpp +++ b/simulation/modulation_awgn/modulation_awgn.cpp @@ -1,8 +1,8 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -18,7 +18,7 @@ void usage(const char *exe_name) { << "The arguments required by " << exe_name << " are:\n\n" << " Number of bits in the encoded message.\n" << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(2) + << armral::simulation::print_valid_mod_type(2) << " Scaling parameter used in demodulation when\n" << " using fixed-point Q2.13 representation for symbols.\n" << " is an integer such that the symbol\n" @@ -40,7 +40,7 @@ struct example_data { example_data(uint32_t k, armral_modulation_type mod) { mod_type = mod; - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); len_in = k; num_mod_symbols = (len_in + bit_per_symbol - 1) / bit_per_symbol; data_in = SNEW(uint8_t, (len_in + 7) / 8); @@ -59,13 +59,13 @@ struct example_data { // Perform an end-to-end modulation, transmission, demodulation, and count the // number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast<uint8_t>(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast<uint8_t>( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
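// For illustration only: the packing statement itself falls outside this
// hunk's context window, but MSB-first packing of bit i would look like
//   data->data_in[byte_ind] |= bit << (7 - (i % 8));
// i.e. bit 0 of the message lands in the most significant bit of byte 0.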
Not sure if // that is an issue with randomly generated data, but we are paying @@ -79,8 +79,8 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data_in, data->data_mod); // AWGN channel effects - add some noise to all the modulated bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -102,7 +102,7 @@ struct sim_result { sim_result(uint32_t k_in, armral_modulation_type mod, uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages) - : k(k_in), mod_type(armral_simulation::mod_to_str(mod)), ulp(ulp_in), + : k(k_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast<double>(nm) / num_messages), ber(static_cast<double>(nb) / (num_messages * k)) {} @@ -129,7 +129,7 @@ struct sim_result { bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, double ebn0_db) { // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding rate is used to convert from Eb/N0 to SNR. This program doesn't // use a coding scheme, so the number of output bits is equal to the number of // input bits. @@ -140,8 +140,8 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, // is equal to the number of bits per symbol. To meet this criterion we take // the symbol rate equal to the bandwidth. double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -154,7 +154,7 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, example_data data(k, mod_type); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, &data); nb += num_bit_errors; num_message_errors += num_bit_errors == 0 ? 
0 : 1; @@ -178,7 +178,7 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; uint16_t ulp = 0; armral_modulation_type mod_type = ARMRAL_MOD_256QAM; @@ -212,10 +212,10 @@ int main(int argc, char **argv) { << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/modulation_awgn/modulation_error_rate.py b/simulation/modulation_awgn/modulation_error_rate.py index c5b72ea749de781fe3123dbeaed539e8cf87547b..14ff20cdf4abd42d5c95f83f855b041c7a435567 100755 --- a/simulation/modulation_awgn/modulation_error_rate.py +++ b/simulation/modulation_awgn/modulation_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser import pandas as pd diff --git a/simulation/polar_awgn/CMakeLists.txt b/simulation/polar_awgn/CMakeLists.txt deleted file mode 100644 index 249b4bc7200dd411a909612b8c28ebb47615bafc..0000000000000000000000000000000000000000 --- a/simulation/polar_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(polar_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(POLAR_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/polar_awgn.cpp -) - -set(POLAR_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(POLAR_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(polar_awgn ${POLAR_AWGN_SOURCES}) -target_link_libraries(polar_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(polar_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(polar_awgn PRIVATE - ${POLAR_AWGN_COMPILE_OPTIONS} - ${POLAR_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation polar_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. 
- add_test(NAME polar_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/polar_awgn -k 32 -e 32 -l 1 -m 0 -i 0 -u 128) - set_tests_properties(polar_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check polar_awgn) -endif() diff --git a/simulation/polar_awgn/polar_awgn.cpp b/simulation/polar_awgn/polar_awgn.cpp index 76241f0d2c5b677129c5ecdb524be4f37a4a4596..c25aa415e33d47da52216c4d1fdeaea1c90eebb1 100644 --- a/simulation/polar_awgn/polar_awgn.cpp +++ b/simulation/polar_awgn/polar_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -56,7 +56,7 @@ void usage(const char *exe_name) { << " this program expects greater\n" << " than or equal to .\n" << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " Flag to enable/disable the interleaving of \n" << " coded bits in Polar rate matching.\n" << " Type 0 : Downlink, Type 1 : Uplink.\n" @@ -129,7 +129,7 @@ struct polar_example_data { i_bil = i_bil_in; mod_type = mod; demod_ulp = demod_ulp_in; - bits_per_mod_symbol = armral_simulation::bits_per_symbol(mod_type); + bits_per_mod_symbol = armral::simulation::bits_per_symbol(mod_type); // We want to encode k bits of data data_in = SNEW(uint8_t, (k + 7) / 8); // Codeword length is n, which is what we interleave @@ -171,15 +171,16 @@ struct polar_example_data { } }; -int run_check(random_state *state, double snr_db, polar_example_data *data) { +int run_check(armral::utils::random_state *state, double snr_db, + polar_example_data *data) { uint32_t crc_bits = 24; // CRC-24 (L = 24) uint32_t msg_bits = data->k - crc_bits; // message length (A = K - L) std::vector<uint8_t> msg((msg_bits + 7) / 8); for (uint32_t i = 0; i < msg_bits; ++i) { - uint8_t bit = - static_cast<uint8_t>(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast<uint8_t>( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -216,8 +217,8 @@ int run_check(random_state *state, double snr_db, polar_example_data *data) { data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); armral_demodulation(data->num_mod_symbols, data->demod_ulp, data->mod_type, data->data_mod, data->data_demod_soft); @@ -271,7 +272,7 @@ struct sim_result { uint16_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages) : len(n), e(e_in), k(k_in), l(l_in), - mod_type(armral_simulation::mod_to_str(mod)), i_bil(i_bil_in), + mod_type(armral::simulation::mod_to_str(mod)), i_bil(i_bil_in), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast<double>(nm) / num_messages), ber(static_cast<double>(nb) / (num_messages * k)) {} @@ -306,7 +307,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, uint16_t ulp, double ebn0_db) { uint32_t n = get_codeword_length(e, k); // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); double coding_rate = (double)k / n; double bw = 1e6; // Bandwidth (B) = 1 MHz // The symbol rate R [symbols/s] is proportional to the bandwidth. For @@ -314,8 +315,8 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, // is equal to the number of bits per symbol. To meet this criterion we take // the symbol rate equal to the bandwidth. double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); int nb = 0; uint64_t nr_total = 0; @@ -327,7 +328,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, polar_example_data data(n, e, k, l, i_bil, mod_type, ulp); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, &data); nb += num_bit_errors; num_message_errors += num_bit_errors == 0 ? 
0 : 1; @@ -348,7 +349,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; bool is_k_set = false; uint32_t e = 0; @@ -408,10 +409,10 @@ int main(int argc, char **argv) { << "\t" << print_valid_l() << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } if (!is_i_bil_set) { diff --git a/simulation/polar_awgn/polar_error_rate.py b/simulation/polar_awgn/polar_error_rate.py index 5cd42341afe8cde5e049fc6032c2e98b4f76325a..f8a76cbf7fdb05a8b9ac16250998fb78f66e94a4 100755 --- a/simulation/polar_awgn/polar_error_rate.py +++ b/simulation/polar_awgn/polar_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/turbo_awgn/CMakeLists.txt b/simulation/turbo_awgn/CMakeLists.txt deleted file mode 100644 index 0f6389a025f8132fa6b9be906671a3a4d274fe7b..0000000000000000000000000000000000000000 --- a/simulation/turbo_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(turbo_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(TURBO_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/turbo_awgn.cpp -) - -set(TURBO_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(TURBO_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(turbo_awgn ${TURBO_AWGN_SOURCES}) -target_link_libraries(turbo_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(turbo_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(turbo_awgn PRIVATE - ${TURBO_AWGN_COMPILE_OPTIONS} - ${TURBO_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation turbo_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. 
- add_test(NAME turbo_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/turbo_awgn -k 40 -m 0 -i 1 -e 60) - set_tests_properties(turbo_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check turbo_awgn) -endif() diff --git a/simulation/turbo_awgn/turbo_awgn.cpp b/simulation/turbo_awgn/turbo_awgn.cpp index 29c8bdb40c92f75ff245a0526507e51792342302..7dbc27114dbe92da9cb10fee01df91d42ebd6797 100644 --- a/simulation/turbo_awgn/turbo_awgn.cpp +++ b/simulation/turbo_awgn/turbo_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -90,7 +90,7 @@ void usage(const char *exe_name) { << " This must be one of:\n" << print_valid_block_size("\t\t\t") << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " Number of bits in the rate-matched message.\n" << " The redundancy version used for rate matching\n" << " and recovery. Supported values are:\n" @@ -145,7 +145,7 @@ struct turbo_example_data { par_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); itl_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); data_matched = SNEW(uint8_t, (len_matched + 7) / 8); - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); num_mod_symbols = (len_matched + bit_per_symbol - 1) / bit_per_symbol; data_mod = SNEW(armral_cmplx_int16_t, num_mod_symbols); sys_recovered = SNEW(int8_t, len_encoded); @@ -175,13 +175,13 @@ struct turbo_example_data { // Perform an end-to-end encoding, rate matching, modulation, transmission, // demodulation, rate recovery, and decoding and count the number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, uint32_t iter_max, turbo_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast<uint8_t>(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast<uint8_t>( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -206,8 +206,8 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise to all the encoded bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -251,7 +251,7 @@ struct sim_result { uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t iter_max_in, uint32_t nb, uint32_t nm, uint32_t num_messages) - : k(k_in), e(e_in), mod_type(armral_simulation::mod_to_str(mod)), + : k(k_in), e(e_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), iter_max(iter_max_in), bler(static_cast<double>(nm) / num_messages), ber(static_cast<double>(nb) / (num_messages * k)) {} @@ -284,15 +284,15 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, // The coding rate is the ratio of input information bits, k, to the number of // rate-matched bits, e. double coding_rate = (double)k / e; - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); double bw = 1e6; // Bandwidth (B) = 1 MHz // The symbol rate R [symbols/s] is proportional to the bandwidth. For // passband transmission using QAM modulation the maximum spectral efficiency // is equal to the number of bits per symbol. To meet this criterion we take // the symbol rate equal to the bandwidth. double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -305,7 +305,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, turbo_example_data data(k, mod_type, e, rv); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, iter_max, &data); nb += num_bit_errors; @@ -379,10 +379,10 @@ int main(int argc, char **argv) { << print_valid_block_size("\t") << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } if (!is_e_set) { diff --git a/simulation/turbo_awgn/turbo_error_rate.py b/simulation/turbo_awgn/turbo_error_rate.py index 6725cdf09f8f6d7d8c46aecc59db1d7a70ec3329..51cd9fdf25171523755ddbdd2b41012c6aac8268 100755 --- a/simulation/turbo_awgn/turbo_error_rate.py +++ b/simulation/turbo_awgn/turbo_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp 
b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp index 5c889e12542a41f1965dfb83e42d9e57c5256a21..c3941c2287f55f7fb9a6292fe7157114e585049e 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -407,11 +407,11 @@ void invert_hermitian_matrix<3>(const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t adj21 = {adj12.re, -adj12.im}; // Determinant (real): A_{0:} * adj(A)_{:0} - float det00 = a00.re * adj00.re; - float det11 = scal_mul_cmplx_f32(a01, adj10).re; - float det22 = scal_mul_cmplx_f32(a02, adj20).re; + float32_t det00 = a00.re * adj00.re; + float32_t det11 = scal_mul_cmplx_f32(a01, adj10).re; + float32_t det22 = scal_mul_cmplx_f32(a02, adj20).re; - float inv_det = 1.0F / (det00 + det11 + det22); + float32_t inv_det = 1.0F / (det00 + det11 + det22); // Write into output array p_dst[0] = {adj00.re * inv_det, adj00.im * inv_det}; @@ -646,8 +646,8 @@ invert_batch_hermitian_matrix_3x3(uint32_t num_mats, assert(num_mats % 4 == 0); - const float *src = (const float *)p_src; - float *dst = (float *)p_dst; + const float32_t *src = (const float32_t *)p_src; + float32_t *dst = (float32_t *)p_dst; uint32_t stride = num_mats * 2; for (uint32_t mat_i = 0; mat_i + 4 <= num_mats; mat_i += 4) { @@ -783,22 +783,22 @@ invert_batch_hermitian_matrix_3x3_pa( assert(num_mats % 4 == 0); - const float *src_00 = (const float *)p_srcs[0]; - const float *src_01 = (const float *)p_srcs[1]; - const float *src_02 = (const float *)p_srcs[2]; - const float *src_11 = (const float *)p_srcs[4]; - const float *src_12 = (const float *)p_srcs[5]; - const float *src_22 = (const float *)p_srcs[8]; - - float *dst_00 = (float *)p_dsts[0]; - float *dst_01 = (float *)p_dsts[1]; - float *dst_02 = (float *)p_dsts[2]; - float *dst_10 = (float *)p_dsts[3]; - float *dst_11 = (float *)p_dsts[4]; - float *dst_12 = (float *)p_dsts[5]; - float *dst_20 = (float *)p_dsts[6]; - float *dst_21 = (float *)p_dsts[7]; - float *dst_22 = (float *)p_dsts[8]; + const float32_t *src_00 = (const float32_t *)p_srcs[0]; + const float32_t *src_01 = (const float32_t *)p_srcs[1]; + const float32_t *src_02 = (const float32_t *)p_srcs[2]; + const float32_t *src_11 = (const float32_t *)p_srcs[4]; + const float32_t *src_12 = (const float32_t *)p_srcs[5]; + const float32_t *src_22 = (const float32_t *)p_srcs[8]; + + float32_t *dst_00 = (float32_t *)p_dsts[0]; + float32_t *dst_01 = (float32_t *)p_dsts[1]; + float32_t *dst_02 = (float32_t *)p_dsts[2]; + float32_t *dst_10 = (float32_t *)p_dsts[3]; + float32_t *dst_11 = (float32_t *)p_dsts[4]; + float32_t *dst_12 = (float32_t *)p_dsts[5]; + float32_t *dst_20 = (float32_t *)p_dsts[6]; + float32_t *dst_21 = (float32_t *)p_dsts[7]; + float32_t *dst_22 = (float32_t *)p_dsts[8]; #if ARMRAL_ARCH_SVE >= 2 for (uint32_t mat_i = 0; mat_i < num_mats; mat_i += 4) { @@ -1357,7 +1357,7 @@ static void sve_invert_hermitian_matrix4x4( svfloat32_t z = svdup_n_f32(0); - // Enable the compiler to optimise away loading c0 and c1. + // Enable the compiler to optimize away loading c0 and c1. 
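// Context for the call below, sketched from the usual 2x2-block inversion
// identity rather than quoted from this file: writing the Hermitian input as
// M = [A B; C D] with C equal to the conjugate transpose of B, the inverse is
// assembled from inv(A) and the Schur complement S = D - C * inv(A) * B.
// Because M is Hermitian, C never needs to be loaded from memory; it is
// recovered from the rows of B already held in registers, which is what the
// sve_mat_conj_tran_2x2 call produces in c0 and c1.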
sve_mat_conj_tran_2x2(b0, b1, &c0, &c1); svfloat32_t a_inv0; @@ -1407,7 +1407,7 @@ void invert_hermitian_matrix<4>(const armral_cmplx_f32_t *__restrict p_src, #ifdef ARMRAL_ARCH_SVE svbool_t p4 = svptrue_pat_b32(SV_VL4); - const float *src = (const float *)p_src; + const float32_t *src = (const float32_t *)p_src; svfloat32_t a0 = svld1_f32(p4, &src[0 * 4]); svfloat32_t b0 = svld1_f32(p4, &src[1 * 4]); svfloat32_t a1 = svld1_f32(p4, &src[2 * 4]); @@ -1450,7 +1450,7 @@ void invert_hermitian_matrix<4>(const armral_cmplx_f32_t *__restrict p_src, float32x4x2_t c; float32x4x2_t d; - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; // Fill sub-blocks matrix a.val[FIRST_ROW] = vld1q_f32(p_mat); @@ -1857,8 +1857,8 @@ invert_batch_hermitian_matrix_4x4(uint32_t num_mats, const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t *p_dst) { - const float *src = (const float *)p_src; - float *dst = (float *)p_dst; + const float32_t *src = (const float32_t *)p_src; + float32_t *dst = (float32_t *)p_dst; uint32_t stride = num_mats * 2; #if ARMRAL_ARCH_SVE >= 2 @@ -2097,39 +2097,39 @@ invert_batch_hermitian_matrix_4x4_pa( uint32_t num_mats, const armral_cmplx_f32_t *__restrict *__restrict p_srcs, armral_cmplx_f32_t *__restrict *__restrict p_dsts) { - const float *src_00 = (const float *)p_srcs[0]; - const float *src_01 = (const float *)p_srcs[1]; - const float *src_02 = (const float *)p_srcs[2]; - const float *src_03 = (const float *)p_srcs[3]; - const float *src_10 = (const float *)p_srcs[4]; - const float *src_11 = (const float *)p_srcs[5]; - const float *src_12 = (const float *)p_srcs[6]; - const float *src_13 = (const float *)p_srcs[7]; - const float *src_20 = (const float *)p_srcs[8]; - const float *src_21 = (const float *)p_srcs[9]; - const float *src_22 = (const float *)p_srcs[10]; - const float *src_23 = (const float *)p_srcs[11]; - const float *src_30 = (const float *)p_srcs[12]; - const float *src_31 = (const float *)p_srcs[13]; - const float *src_32 = (const float *)p_srcs[14]; - const float *src_33 = (const float *)p_srcs[15]; - - float *dst_00 = (float *)p_dsts[0]; - float *dst_01 = (float *)p_dsts[1]; - float *dst_02 = (float *)p_dsts[2]; - float *dst_03 = (float *)p_dsts[3]; - float *dst_10 = (float *)p_dsts[4]; - float *dst_11 = (float *)p_dsts[5]; - float *dst_12 = (float *)p_dsts[6]; - float *dst_13 = (float *)p_dsts[7]; - float *dst_20 = (float *)p_dsts[8]; - float *dst_21 = (float *)p_dsts[9]; - float *dst_22 = (float *)p_dsts[10]; - float *dst_23 = (float *)p_dsts[11]; - float *dst_30 = (float *)p_dsts[12]; - float *dst_31 = (float *)p_dsts[13]; - float *dst_32 = (float *)p_dsts[14]; - float *dst_33 = (float *)p_dsts[15]; + const float32_t *src_00 = (const float32_t *)p_srcs[0]; + const float32_t *src_01 = (const float32_t *)p_srcs[1]; + const float32_t *src_02 = (const float32_t *)p_srcs[2]; + const float32_t *src_03 = (const float32_t *)p_srcs[3]; + const float32_t *src_10 = (const float32_t *)p_srcs[4]; + const float32_t *src_11 = (const float32_t *)p_srcs[5]; + const float32_t *src_12 = (const float32_t *)p_srcs[6]; + const float32_t *src_13 = (const float32_t *)p_srcs[7]; + const float32_t *src_20 = (const float32_t *)p_srcs[8]; + const float32_t *src_21 = (const float32_t *)p_srcs[9]; + const float32_t *src_22 = (const float32_t *)p_srcs[10]; + const float32_t *src_23 = (const float32_t *)p_srcs[11]; + const float32_t *src_30 = (const float32_t *)p_srcs[12]; + const float32_t *src_31 = (const float32_t *)p_srcs[13]; + const 
float32_t *src_32 = (const float32_t *)p_srcs[14]; + const float32_t *src_33 = (const float32_t *)p_srcs[15]; + + float32_t *dst_00 = (float32_t *)p_dsts[0]; + float32_t *dst_01 = (float32_t *)p_dsts[1]; + float32_t *dst_02 = (float32_t *)p_dsts[2]; + float32_t *dst_03 = (float32_t *)p_dsts[3]; + float32_t *dst_10 = (float32_t *)p_dsts[4]; + float32_t *dst_11 = (float32_t *)p_dsts[5]; + float32_t *dst_12 = (float32_t *)p_dsts[6]; + float32_t *dst_13 = (float32_t *)p_dsts[7]; + float32_t *dst_20 = (float32_t *)p_dsts[8]; + float32_t *dst_21 = (float32_t *)p_dsts[9]; + float32_t *dst_22 = (float32_t *)p_dsts[10]; + float32_t *dst_23 = (float32_t *)p_dsts[11]; + float32_t *dst_30 = (float32_t *)p_dsts[12]; + float32_t *dst_31 = (float32_t *)p_dsts[13]; + float32_t *dst_32 = (float32_t *)p_dsts[14]; + float32_t *dst_33 = (float32_t *)p_dsts[15]; #if ARMRAL_ARCH_SVE >= 2 for (uint32_t mat_i = 0; mat_i < num_mats; mat_i += 4) { @@ -2523,7 +2523,7 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, store_quadrant<8>(d_out, 1, 1, p_dst); #else - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; float32x4_t mat_a[8]; float32x4_t mat_b[8]; float32x4_t mat_c[8]; @@ -2553,9 +2553,9 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, p_mat += 4; } - float *p_mat_a = (float *)mat_a; - float *p_mat_b = (float *)mat_b; - float *p_mat_c = (float *)mat_c; + float32_t *p_mat_a = (float32_t *)mat_a; + float32_t *p_mat_b = (float32_t *)mat_b; + float32_t *p_mat_c = (float32_t *)mat_c; /*Calculate inverse sublock A */ float32x4_t inv_a[8]; @@ -2618,7 +2618,7 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, block11[i] = vaddq_f32(inv_a[i], temp_mat2[i]); } - float *p_inv = (float *)p_dst; + float32_t *p_inv = (float32_t *)p_dst; vst1q_f32(p_inv, block11[0]); p_inv += 4; @@ -2743,7 +2743,7 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, store_quadrant<16>(c_out, 1, 0, p_dst); store_quadrant<16>(d_out, 1, 1, p_dst); #else - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; float32x4_t mat_a[32]; float32x4_t mat_b[32]; float32x4_t mat_c[32]; @@ -2789,9 +2789,9 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, p_mat += 4; } - float *p_mat_a = (float *)mat_a; - float *p_mat_b = (float *)mat_b; - float *p_mat_c = (float *)mat_c; + float32_t *p_mat_a = (float32_t *)mat_a; + float32_t *p_mat_b = (float32_t *)mat_b; + float32_t *p_mat_c = (float32_t *)mat_c; /*Calculate inverse sublock A */ float32x4_t inv_a[32]; @@ -2864,7 +2864,7 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, block11[2 * i + 1] = vaddq_f32(inv_a[2 * i + 1], temp_mat2[2 * i + 1]); } - float *p_inv = (float *)p_dst; + float32_t *p_inv = (float32_t *)p_dst; vst1q_f32(p_inv, block11[2 * 0 + 0]); p_inv += 4; diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp index fccc80a6fa37031f62d10789d7fea3b772f7994d..7105e2ba2a58b4150c7fd65b63f8d4e913c1b648 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -668,7 +668,7 @@ static 
void invert_batch_matrix_2x2_impl( // det = a * d - b * c armral_cmplx_f32_t det = scal_minor_cmplx_f32(a, b, c, d); // inv_det = 1 / det = conj(det) / abs(det)^2 - float det_abs2_inv = 1.0F / scal_mod2_cmplx_f32(det).re; + float32_t det_abs2_inv = 1.0F / scal_mod2_cmplx_f32(det).re; armral_cmplx_f32_t inv_det = {det.re * det_abs2_inv, -det.im * det_abs2_inv}; armral_cmplx_f32_t minus_inv_det = {-inv_det.re, -inv_det.im}; // p_dst = | d, -b | * inv_det diff --git a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp index cb47a3ed62ab934d97f21810085af4f8108476cf..49eebadac455de28c61b5a323c80e0619bfd4841 100644 --- a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp +++ b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ namespace armral::cmplx_herm_mat_inv { diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp index 29d168396c7040d61fb3e6b7079516c1402e0465..339080ca14fdd5ed8cff3c8394dba403daf28ba4 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c index ddad389b10aa580d9ff12368b2de0ea315d7960c..15d40f5f8f2bb58748546b45d70fd539048ea0e7 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -19,6 +19,38 @@ static void cmplx_mat_mult_ahb_b2x2(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 2; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p4 = svptrue_pat_b32(SV_VL4); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + svfloat32_t b_r0 = svld1_f32(p4, (float32_t const *)p_src_b); + svfloat32_t b_r1 = svld1_f32(p4, ((float32_t const *)p_src_b) + 4); + svfloat32_t b_r0_rev = svrev64_f32(p4, b_r0); + svfloat32_t b_r1_rev = svrev64_f32(p4, b_r1); + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t acc_1; + svfloat32_t acc_2; + + // r = 0 + svfloat32_t a_r0j = svld1_f32(p2, (float32_t const *)&p_src_a[j]); + // [R0*r, R0*i, R0*r, R0*i] + acc_1 = svmul_lane_f32(b_r0, a_r0j, 0); + // [I0*i, I0*r, I0*i, I0*r] + acc_2 = svmul_lane_f32(b_r0_rev, a_r0j, 1); + + // r = 1 + svfloat32_t a_r1j = svld1_f32(p2, (float32_t const *)&p_src_a[n + j]); + // [R0*r, R0*i, R0*r, R0*i] + acc_1 = svmla_lane_f32(acc_1, b_r1, a_r1j, 0); + // [I0*i, I0*r, I0*i, I0*r] + acc_2 = svmla_lane_f32(acc_2, b_r1_rev, a_r1j, 1); + + svfloat32_t result = svadd_f32_x( + p4, acc_1, + svreinterpret_f32_f64(svneg_f64_x(p4, svreinterpret_f64_f32(acc_2)))); + svst1_f32(p4, (float32_t *)&p_dst[mk * j], result); + } +#else float32x4_t b_r0 = vld1q_f32((float32_t const *)p_src_b); float32x4_t b_r1 = vld1q_f32(((float32_t const *)p_src_b) + 4); float32x4_t 
b_r0_rev = vrev64q_f32(b_r0); @@ -46,6 +78,7 @@ static void cmplx_mat_mult_ahb_b2x2(uint16_t n, vst1q_f32((float32_t *)&p_dst[mk * j], result); } +#endif } static void cmplx_mat_mult_ahb_b3x3(uint16_t n, @@ -54,6 +87,42 @@ static void cmplx_mat_mult_ahb_b3x3(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 3; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p3 = svptrue_pat_b32(SV_VL3); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + + svfloat32x2_t b_r0 = svld2_f32(p3, (float32_t const *)&p_src_b[0]); + svfloat32x2_t b_r1 = svld2_f32(p3, (float32_t const *)&p_src_b[3]); + svfloat32x2_t b_r2 = svld2_f32(p3, (float32_t const *)&p_src_b[6]); + svfloat32x2_t *b_rows[] = {&b_r0, &b_r1, &b_r2}; + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t dot_0 = svundef_f32(); + svfloat32_t dot_1 = svundef_f32(); + // Note: We leave it to the compiler to unroll this loop over mk + for (uint32_t r = 0; r < mk; r++) { + svfloat32_t a_rj = svld1_f32(p2, (float32_t const *)&p_src_a[r * n + j]); + // Note: We leave it to the compiler to eliminate the following branch + if (r == 0) { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmul_lane_f32(svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmul_lane_f32(svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } else { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmla_lane_f32(dot_1, svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } + } + svfloat32x2_t dot = svcreate2(dot_0, dot_1); + svst2_f32(p3, (float32_t *)&p_dst[mk * j], dot); + } +#else // Copy the final row of B so we can safely read one extra column: armral_cmplx_f32_t final_row[4]; memcpy(final_row, &p_src_b[6], sizeof(armral_cmplx_f32_t) * 3); @@ -100,6 +169,7 @@ static void cmplx_mat_mult_ahb_b3x3(uint16_t n, // Store the remaining column: vst1_f32(((float32_t *)&p_dst[mk * j]) + 4, vget_low_f32(result.val[1])); } +#endif } static void cmplx_mat_mult_ahb_b4x4(uint16_t n, @@ -108,6 +178,44 @@ static void cmplx_mat_mult_ahb_b4x4(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 4; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p4 = svptrue_pat_b32(SV_VL4); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + + svfloat32x2_t b_r0 = svld2_f32(p4, (float32_t const *)&p_src_b[0]); + svfloat32x2_t b_r1 = svld2_f32(p4, (float32_t const *)&p_src_b[4]); + svfloat32x2_t b_r2 = svld2_f32(p4, (float32_t const *)&p_src_b[8]); + svfloat32x2_t b_r3 = svld2_f32(p4, (float32_t const *)&p_src_b[12]); + svfloat32x2_t *b_rows[] = {&b_r0, &b_r1, &b_r2, &b_r3}; + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t dot_0 = svundef_f32(); + svfloat32_t dot_1 = svundef_f32(); + // Note: We leave it to the compiler to unroll this loop over mk + for (uint32_t r = 0; r < mk; r++) { + svfloat32_t a_rj = svld1_f32(p2, (float32_t const *)&p_src_a[r * n + j]); + // Note: We leave it to the compiler to eliminate the following branch + if (r == 0) { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmul_lane_f32(svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmul_lane_f32(svget2(*b_rows[r], 
1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } else { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmla_lane_f32(dot_1, svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } + } + + svfloat32x2_t dot = svcreate2(dot_0, dot_1); + svst2_f32(p4, (float32_t *)&p_dst[mk * j], dot); + } +#else float32x4x2_t b_rows[4] = {vld2q_f32((float32_t const *)&p_src_b[0]), vld2q_f32((float32_t const *)&p_src_b[4]), vld2q_f32((float32_t const *)&p_src_b[8]), @@ -137,6 +245,7 @@ static void cmplx_mat_mult_ahb_b4x4(uint16_t n, } vst2q_f32((float32_t *)&p_dst[mk * j], dot); } +#endif } #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c index 4712548218b0c65941223535073c1d25f3baacaa..b24f257068c972ea7a777377285d7f3d6c2095c4 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -642,8 +642,8 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, } #else - const float *p_in1 = (const float *)p_src_a; - const float *p_in2 = (const float *)p_src_b; + const float32_t *p_in1 = (const float32_t *)p_src_a; + const float32_t *p_in2 = (const float32_t *)p_src_b; const armral_cmplx_f32_t *p_in_a = p_src_a; armral_cmplx_f32_t *p_out = p_dst; armral_cmplx_f32_t *px; @@ -664,8 +664,8 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, float32x4_t b_col_real2; float32x4_t b_col_im2; float32x2_t accum = vdup_n_f32(0); - const float *p_in1_b = (const float *)p_src_a; - const float *p_in1_b2 = (const float *)p_src_b; + const float32_t *p_in1_b = (const float32_t *)p_src_a; + const float32_t *p_in1_b2 = (const float32_t *)p_src_b; uint16_t col; uint16_t i = 0U; @@ -690,7 +690,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* For every row wise process, the pIn2 pointer is set ** to the starting address of the pSrcB data */ - p_in2 = (const float *)p_src_b; + p_in2 = (const float32_t *)p_src_b; p_in1_b2 = p_in2 + 2 * num_cols_b; j = 0U; @@ -721,7 +721,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; float32x4_t acc_r0 = {}; @@ -883,7 +883,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, // /* Update the pointer pIn2 to point to the starting address of the // next column */ j++; - p_in2 = (const float *)p_src_b + 4U * j; + p_in2 = (const float32_t *)p_src_b + 4U * j; p_in1_b2 = p_in2 + 2U * num_cols_b; col--; } @@ -902,7 +902,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; float32x4_t 
acc_r0 = {}; @@ -1043,7 +1043,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* For every row wise process, the pIn2 pointer is set ** to the starting address of the pSrcB data */ - p_in2 = (const float *)p_src_b; + p_in2 = (const float32_t *)p_src_b; j = 0U; @@ -1058,7 +1058,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; float32x4_t acc_r0 = {}; float32x4_t acc_i0 = {}; @@ -1144,7 +1144,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Update the pointer pIn2 to point to the starting address of the next * column */ j++; - p_in2 = (const float *)p_src_b + 2U * j; + p_in2 = (const float32_t *)p_src_b + 2U * j; /* Decrement the column loop counter */ col--; diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c index f1b33411668af96e551912f4142897475610ba41..8aa33c5db84bc28e7b03dcff719e563860b14658 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c index fdfe709dac1f4dc6a7f596e31e3ad1ed0c90dabb..54b182ec5d53d93de51cb5ec69999adfdb906a53 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c index 631c0b18c5315209bda20a5ebee8c33d6eff2163..83f9ec108e44925ad66618d43095659d027109d1 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -15,8 +15,8 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, const armral_cmplx_f32_t *restrict p_src_a, const armral_cmplx_f32_t *restrict p_src_x, armral_cmplx_f32_t *p_dst) { - const float *p_in1 = (const float *)p_src_a; - const float *p_in2 = (const float *)p_src_x; + const float32_t *p_in1 = (const float32_t *)p_src_a; + const float32_t *p_in2 = (const float32_t *)p_src_x; const armral_cmplx_f32_t *p_in_a = p_src_a; armral_cmplx_f32_t *p_out = p_dst; uint16_t num_rows_a = m; // number of rows of input matrix A @@ -25,18 +25,18 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, #ifdef ARMRAL_ARCH_SVE svbool_t ptrue = svptrue_b32(); if (num_rows_a % 2 == 0) { - const float *p_in1_2 = (const float *)p_src_a; + const float32_t *p_in1_2 = (const float32_t *)p_src_a; // Loop over A two rows at a time for (uint16_t row_cnt = num_rows_a; 
row_cnt > 0; row_cnt -= 2, p_out += 2, p_in_a += 2 * num_cols_a) { - // Initialise p_in1 and p_in1_2 to point to the starting addresses of the + // Initialize p_in1 and p_in1_2 to point to the starting addresses of the // current rows - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_2 = p_in1 + 2 * num_cols_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = (const float32_t *)p_src_x; // Initialize accumulators svfloat32_t sum = svdup_n_f32(0); @@ -107,12 +107,12 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, for (uint16_t row_cnt = num_rows_a; row_cnt > 0; --row_cnt, p_out++, p_in_a += num_cols_a) { - // Initialise p_in1 to point to the starting address of the current row - p_in1 = (const float *)p_in_a; + // Initialize p_in1 to point to the starting address of the current row + p_in1 = (const float32_t *)p_in_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = (const float32_t *)p_src_x; // Initialize accumulators svfloat32_t sum = svdup_n_f32(0); @@ -167,12 +167,12 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, for (uint16_t row_cnt = num_rows_a; row_cnt > 0; --row_cnt, ++p_out, p_in_a += num_cols_a) { - // Initialise p_in1 to point to the starting address of the current row - p_in1 = (const float *)p_in_a; + // Initialize p_in1 to point to the starting address of the current row + p_in1 = (const float32_t *)p_in_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = (const float32_t *)p_src_x; // Initialize accumulators float32_t acc_re = 0.0; @@ -255,8 +255,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, float32x2_t acc_3; float32x2_t acc_4; { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2x2_t x_vec = vld1_f32_x2((float const *)current_x); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2x2_t x_vec = vld1_f32_x2((float32_t const *)current_x); acc_1 = vmul_laneq_f32(x_vec.val[0], a_vec, 0); acc_2 = vmul_laneq_f32(x_vec.val[0], a_vec, 1); acc_3 = vmul_laneq_f32(x_vec.val[1], a_vec, 2); @@ -265,8 +265,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2x2_t x_vec = vld1_f32_x2((float const *)current_x); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2x2_t x_vec = vld1_f32_x2((float32_t const *)current_x); acc_1 = vfma_laneq_f32(acc_1, x_vec.val[0], a_vec, 0); acc_2 = vfma_laneq_f32(acc_2, x_vec.val[0], a_vec, 1); acc_3 = vfma_laneq_f32(acc_3, x_vec.val[1], a_vec, 2); @@ -279,8 +279,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, vadd_f32(acc_1, vneg64_f32(vneg_f32(vrev64_f32(acc_2)))); float32x2_t result_2 = vadd_f32(acc_3, vneg64_f32(vneg_f32(vrev64_f32(acc_4)))); - vst1_f32((float *)(out_ptr + 0), result_1); - vst1_f32((float *)(out_ptr + 1), result_2); + vst1_f32((float32_t *)(out_ptr + 0), result_1); + vst1_f32((float32_t *)(out_ptr + 1), result_2); a_current_row_start += num_mats * n; out_ptr += num_mats; @@ -328,9 +328,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t acc_3; float32x4_t acc_4; { - float32x4_t a_vec = 
vld1q_f32((float const *)current_a); - float32x4_t x1_vec = vld1q_f32((float const *)current_x_1); - float32x4_t x2_vec = vld1q_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x4_t x1_vec = vld1q_f32((float32_t const *)current_x_1); + float32x4_t x2_vec = vld1q_f32((float32_t const *)current_x_2); acc_1 = vmulq_laneq_f32(x1_vec, a_vec, 0); acc_2 = vmulq_laneq_f32(x1_vec, a_vec, 1); acc_3 = vmulq_laneq_f32(x2_vec, a_vec, 2); @@ -340,9 +340,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x4_t x1_vec = vld1q_f32((float const *)current_x_1); - float32x4_t x2_vec = vld1q_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x4_t x1_vec = vld1q_f32((float32_t const *)current_x_1); + float32x4_t x2_vec = vld1q_f32((float32_t const *)current_x_2); acc_1 = vfmaq_laneq_f32(acc_1, x1_vec, a_vec, 0); acc_2 = vfmaq_laneq_f32(acc_2, x1_vec, a_vec, 1); acc_3 = vfmaq_laneq_f32(acc_3, x2_vec, a_vec, 2); @@ -356,8 +356,8 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( vaddq_f32(acc_1, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_2)))); float32x4_t result_2 = vaddq_f32(acc_3, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_4)))); - vst1q_f32((float *)out_ptr_1, result_1); - vst1q_f32((float *)out_ptr_2, result_2); + vst1q_f32((float32_t *)out_ptr_1, result_1); + vst1q_f32((float32_t *)out_ptr_2, result_2); a_current_row_start += num_mats * n; out_ptr_1 += vec_stride; @@ -385,9 +385,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x2_t acc_3; float32x2_t acc_4; { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2_t x1_vec = vld1_f32((float const *)current_x_1); - float32x2_t x2_vec = vld1_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2_t x1_vec = vld1_f32((float32_t const *)current_x_1); + float32x2_t x2_vec = vld1_f32((float32_t const *)current_x_2); acc_1 = vmul_laneq_f32(x1_vec, a_vec, 0); acc_2 = vmul_laneq_f32(x1_vec, a_vec, 1); acc_3 = vmul_laneq_f32(x2_vec, a_vec, 2); @@ -397,9 +397,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2_t x1_vec = vld1_f32((float const *)current_x_1); - float32x2_t x2_vec = vld1_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2_t x1_vec = vld1_f32((float32_t const *)current_x_1); + float32x2_t x2_vec = vld1_f32((float32_t const *)current_x_2); acc_1 = vfma_laneq_f32(acc_1, x1_vec, a_vec, 0); acc_2 = vfma_laneq_f32(acc_2, x1_vec, a_vec, 1); acc_3 = vfma_laneq_f32(acc_3, x2_vec, a_vec, 2); @@ -413,8 +413,8 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( vadd_f32(acc_1, vneg64_f32(vneg_f32(vrev64_f32(acc_2)))); float32x2_t result_2 = vadd_f32(acc_3, vneg64_f32(vneg_f32(vrev64_f32(acc_4)))); - vst1_f32((float *)out_ptr_1, result_1); - vst1_f32((float *)out_ptr_2, result_2); + vst1_f32((float32_t *)out_ptr_1, result_1); + vst1_f32((float32_t *)out_ptr_2, result_2); a_current_row_start += num_mats * n; out_ptr_1 += vec_stride; @@ -438,16 +438,16 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t acc_1; float32x4_t acc_2; { - float32x2_t a_vec = vld1_f32((float const *)current_a); - float32x4_t x_vec = vld1q_f32((float const 
*)current_x); + float32x2_t a_vec = vld1_f32((float32_t const *)current_a); + float32x4_t x_vec = vld1q_f32((float32_t const *)current_x); acc_1 = vmulq_lane_f32(x_vec, a_vec, 0); acc_2 = vmulq_lane_f32(x_vec, a_vec, 1); current_a += num_mats; current_x += vec_stride; } for (uint32_t col = 1; col < n; col++) { - float32x2_t a_vec = vld1_f32((float const *)current_a); - float32x4_t x_vec = vld1q_f32((float const *)current_x); + float32x2_t a_vec = vld1_f32((float32_t const *)current_a); + float32x4_t x_vec = vld1q_f32((float32_t const *)current_x); acc_1 = vfmaq_lane_f32(acc_1, x_vec, a_vec, 0); acc_2 = vfmaq_lane_f32(acc_2, x_vec, a_vec, 1); current_a += num_mats; @@ -456,7 +456,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t result = vaddq_f32(acc_1, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_2)))); - vst1q_f32((float *)out_ptr, result); + vst1q_f32((float32_t *)out_ptr, result); out_ptr += vec_stride; a_current_row_start += num_mats * n; @@ -488,18 +488,18 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32_pa( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { // Set the accumulator variables to zero - float sum_real1 = 0; - float sum_imag1 = 0; - float sum_real2 = 0; - float sum_imag2 = 0; + float32_t sum_real1 = 0; + float32_t sum_imag1 = 0; + float32_t sum_real2 = 0; + float32_t sum_imag2 = 0; // Loop over the row of A one column at a time for (uint16_t col_cnt = 0; col_cnt < n; ++col_cnt) { int i = (row_cnt * n) + col_cnt; - float a_re = p_srcs_a[i][mat_idx].re; - float a_im = p_srcs_a[i][mat_idx].im; - float x_re = p_srcs_x[col_cnt][vec_idx].re; - float x_im = p_srcs_x[col_cnt][vec_idx].im; + float32_t a_re = p_srcs_a[i][mat_idx].re; + float32_t a_im = p_srcs_a[i][mat_idx].im; + float32_t x_re = p_srcs_x[col_cnt][vec_idx].re; + float32_t x_im = p_srcs_x[col_cnt][vec_idx].im; sum_real1 += a_re * x_re; sum_imag1 += a_im * x_re; sum_real2 -= a_im * x_im; diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c index 1f2ba56b952477bcb8c7bd4b8b7fe7227a5741c7..325b206829122b4a5afae26939a58a9fef745126 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -47,7 +47,7 @@ armral_cmplx_mat_vec_mult_i16(const uint16_t m, const uint16_t n, int64_t sum_real1_b_ext = 0; int64_t sum_imag1_b_ext = 0; - // Initialise the pointer pIn1 to point to the starting address of the + // Initialize the pointer pIn1 to point to the starting address of the // column being processed p_in1 = (const int16_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; @@ -259,7 +259,7 @@ armral_cmplx_mat_vec_mult_i16(const uint16_t m, const uint16_t n, int64_t sum_real2_ext = 0; int64_t sum_imag2_ext = 0; - // Initialise the pointer pIn1 to point to the starting address of the row + // Initialize the pointer pIn1 to point to the starting address of the row // being processed p_in1 = (const int16_t *)p_in_a; @@ -444,7 +444,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = m; row_cnt > 0; --row_cnt, p_out += vec_stride, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address 
of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data @@ -553,7 +553,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = m; row_cnt > 0; --row_cnt, p_out += vec_stride, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data @@ -654,7 +654,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, p_out += num_mats, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data @@ -734,7 +734,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, p_out += num_mats, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c index 5f0c082855ba5d202e599dd611140c6e82f439c5..f7fa1d50676624b8277d92670727789ae430c22a 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -87,7 +87,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += num_mats) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -166,7 +166,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += num_mats) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -238,7 +238,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += vec_stride) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -314,7 +314,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += vec_stride) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -391,7 +391,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_one_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t 
sum_real; int32x4_t sum_imag; @@ -446,7 +446,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_one_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -506,7 +506,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_unroll_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -570,7 +570,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_unroll_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c index 6717b10ecfd9ddcce0a1d72b226958a0cddbd71a..18124b58c9af8a43f6e1d07b194e05353c386285 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_convert.h" @@ -16,13 +16,14 @@ armral_status armral_solve_2x2_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const uint32_t p_xstride, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[2] = {1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1]}; + float32_t y_shifts[2] = {1 << p_y_num_fract_bits[0], + 1 << p_y_num_fract_bits[1]}; svbool_t pg32; for (uint32_t i = 0; @@ -170,14 +171,15 @@ armral_status armral_solve_2x4_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const uint32_t p_xstride, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = {1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1], - 1 << p_y_num_fract_bits[2], 1 << p_y_num_fract_bits[3]}; + float32_t y_shifts[4] = { + 1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1], + 1 << p_y_num_fract_bits[2], 1 << p_y_num_fract_bits[3]}; svbool_t pg32; for (uint32_t i = 0; @@ -356,13 +358,13 @@ armral_status armral_solve_4x4_1sc_f32( uint32_t p_gstride, armral_cmplx_int16_t *p_x, uint32_t p_xstride, armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = { + float32_t y_shifts[4] = { 1U << (16 + p_y_num_fract_bits[0]), 1U << (16 + p_y_num_fract_bits[1]), 1U << (16 + p_y_num_fract_bits[2]), 1U << (16 + p_y_num_fract_bits[3])}; svbool_t pg16; @@ -657,13 +659,13 @@ armral_status armral_solve_1x4_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t 
*p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = { + float32_t y_shifts[4] = { 1U << (16 + p_y_num_fract_bits[0]), 1U << (16 + p_y_num_fract_bits[1]), 1U << (16 + p_y_num_fract_bits[2]), 1U << (16 + p_y_num_fract_bits[3])}; svbool_t pg16; @@ -797,13 +799,14 @@ armral_status armral_solve_1x2_1sc_f32( uint32_t p_gstride, armral_cmplx_int16_t *p_x, armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[2] = {1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1]}; + float32_t y_shifts[2] = {1 << p_y_num_fract_bits[0], + 1 << p_y_num_fract_bits[1]}; svbool_t pg32; for (uint32_t i = 0; diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h index b0f9c8ab97ef9ce41aef4d8becc8f0b22a28fd2b..c542d2e6752cc1ff7682b7e45b8b60e529e15a3a 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c index 56bd0d8dc47269acf670311d076022e5ffb1bdad..e9a43c23c5b7d0f4c11543c308c5d3ef8551ed33 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_4sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h index f2963b814e354985faa3bd46fc1c64332250de9c..f6854b7a6e8e46c537b15da3aae5ccce10778c1d 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c index edadced8061defad601ec118ed44cad8802bb982..94dfcdca0364deeb1f991918db44725220965343 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_6sc.h" #include "arm_solve_convert.h" @@ -472,7 +472,7 @@ armral_status armral_solve_4x4_6sc_f32( 1. / (1U << (16 + p_y_num_fract_bits[2])), 1. / (1U << (16 + p_y_num_fract_bits[3]))}; - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; // The loop is unrolled so that 2 matrices are computed per iteration, // therefore 12 subcarriers (i.e. 
y vectors) are used in each iteration diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h index 52bcf9da786e3ab4878e8b200158468b6c043418..249dbae601b20ea8d6733bf2215541e09d6c8ea4 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_convert.h b/src/BasicMathFun/MatrixMult/arm_solve_convert.h index acc88428d4c507f25fe86f00ba73aa6f7a29f348..cd8cb13ba6c589fa7622ce0b774693528fcb4227 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_convert.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_convert.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -62,7 +62,7 @@ armral_convert_f32_i16_x8(const armral_fixed_point_index num_fract_bits, const float32x4_t *in_vector1_re, const float32x4_t *in_vector1_im, int16x8_t *out_vector0, int16x8_t *out_vector1) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re[0] * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im[0] * x_mult); @@ -93,7 +93,7 @@ armral_convert_f32_i16_x8_2(const armral_fixed_point_index num_fract_bits, const float32x4_t *in_vector0_re, const float32x4_t *in_vector0_im, int16x8_t *out_vector0) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re[0] * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im[0] * x_mult); @@ -114,7 +114,7 @@ armral_convert_f32_i16_x4(const armral_fixed_point_index num_fract_bits, float32x4_t in_vector0_re, float32x4_t in_vector0_im, float32x4_t in_vector1_re, float32x4_t in_vector1_im, int16x8_t *out_vector0, int16x8_t *out_vector1) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im * x_mult); @@ -134,7 +134,7 @@ static inline void __attribute__((always_inline)) armral_convert_f32_i16_x4_2(const armral_fixed_point_index num_fract_bits, float32x4_t in_vector0_re, float32x4_t in_vector0_im, int16x8_t *out_vector0) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im * x_mult); @@ -152,7 +152,7 @@ armral_convert_f32_fixed_i16(const armral_fixed_point_index num_fract_bits, const svfloat32_t in_vector_re, const svfloat32_t in_vector_im, const svbool_t pg32) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; svfloat32_t in32_re = svmul_n_f32_x(pg32, in_vector_re, x_mult); svfloat32_t in32_im = svmul_n_f32_x(pg32, in_vector_im, x_mult); svint32_t res_re = svcvt_s32_f32_x(pg32, in32_re); @@ -169,4 +169,4 @@ armral_convert_f32_i16(const svfloat32_t in_vector_re, svint16_t res_16_re = svqxtnb_s32(res_re); return svqxtnt_s32(res_16_re, res_im); } -#endif \ No newline at end of file +#endif diff --git a/src/BasicMathFun/MatrixMult/arm_solve_f32.c b/src/BasicMathFun/MatrixMult/arm_solve_f32.c index 
75dfecd78e6854c094acd007e9e18bfbd59f982d..62cf87a02d03b678dfd9ab71692ca960091cf8bd 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_4sc.h" diff --git a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp index 999db43406e52bd538fa3f73885964e2c420d746..ebca77ca94ff8978062bae7f328f9cee68fc99b0 100644 --- a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp +++ b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" @@ -24,12 +24,17 @@ void left_pseudo_inverse(uint16_t m, const float32_t lambda, armral_cmplx_mat_mult_ahb_f32(m, n, n, p_src, p_src, mat_aha); // Compute C += lambda * I - armral::cmplx_mat_pseudo_inv::add_lambda(lambda, p_dst); + armral::cmplx_mat_pseudo_inv::add_lambda(lambda, mat_aha); // Compute B = C^(-1) auto mat_inv = allocate_uninitialized<armral_cmplx_f32_t>(allocator, n * n); - armral::cmplx_herm_mat_inv::invert_hermitian_matrix<n>(mat_aha, - mat_inv.get()); + if constexpr (n == 1) { + mat_inv[0].re = 1.F / mat_aha[0].re; + mat_inv[0].im = 0.F; + } else { + armral::cmplx_herm_mat_inv::invert_hermitian_matrix<n>(mat_aha, + mat_inv.get()); + } // Compute B * A^H armral::cmplx_mat_pseudo_inv::mat_mult_bah_f32(m, n, p_src, mat_inv.get(), @@ -51,8 +56,13 @@ void right_pseudo_inverse(uint16_t n, const float32_t lambda, // Compute B = C^(-1) auto mat_inv = allocate_uninitialized<armral_cmplx_f32_t>(allocator, m * m); - armral::cmplx_herm_mat_inv::invert_hermitian_matrix<m>(mat_aah, - mat_inv.get()); + if constexpr (m == 1) { + mat_inv[0].re = 1.F / mat_aah[0].re; + mat_inv[0].im = 0.F; + } else { + armral::cmplx_herm_mat_inv::invert_hermitian_matrix<m>(mat_aah, + mat_inv.get()); + } // Compute A^H * B armral_cmplx_mat_mult_ahb_f32(m, n, m, p_src, mat_inv.get(), p_dst); @@ -73,6 +83,10 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, // columns then use the left pseudo-inverse if (m > n) { switch (n) { + case 1: { + left_pseudo_inverse<1>(m, lambda, p_src, p_dst, allocator); + break; + } case 2: { left_pseudo_inverse<2>(m, lambda, p_src, p_dst, allocator); break; @@ -103,6 +117,10 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, // If the number of rows in the input matrix is less than or equal to the number // of columns then use the right pseudo-inverse switch (m) { + case 1: { + right_pseudo_inverse<1>(n, lambda, p_src, p_dst, allocator); + break; + } case 2: { right_pseudo_inverse<2>(n, lambda, p_src, p_dst, allocator); break; diff --git a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp index 04525b9a22c781c19707c0eb4bfbac680d6b3136..c4d80719c7bdde8d28d497b8f21b4c48e2e6c3f2 100644 --- a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp +++ b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates +
SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ namespace armral::cmplx_mat_pseudo_inv { diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c index 98134622d1b3c9223fd60f35d7a147603fd804d6..9d95aa2c1c7a3da4aa6de4a1b56a2ae13037ba18 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -21,10 +21,10 @@ armral_cmplx_vecdot_f32(int32_t n, const armral_cmplx_f32_t *restrict p_src_a, int32_t i = 0; for (; i * num_lanes <= n - 2 * num_lanes; i += 2) { svbool_t pg = svptrue_b32(); - svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float *)p_src_b, i); - svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float *)p_src_a, i + 1); - svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float *)p_src_b, i + 1); + svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); + svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i + 1); + svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i + 1); acc0 = svcmla_f32_m(pg, acc0, vec_a0, vec_b0, 0); acc0 = svcmla_f32_m(pg, acc0, vec_a0, vec_b0, 90); @@ -34,8 +34,8 @@ armral_cmplx_vecdot_f32(int32_t n, const armral_cmplx_f32_t *restrict p_src_a, for (; i * num_lanes < n; ++i) { svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * n); - svfloat32_t vec_a = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b = svld1_vnum_f32(pg, (const float *)p_src_b, i); + svfloat32_t vec_a = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); acc0 = svcmla_f32_m(pg, acc0, vec_a, vec_b, 0); acc0 = svcmla_f32_m(pg, acc0, vec_a, vec_b, 90); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c index 4549d4a31537822ff5fb32052f589316f483ad2a..4a2da77d7844c985023f6907204e7afe1163b62b 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE #include <arm_sve.h> #endif @@ -8,11 +8,12 @@ armral_status armral_cmplx_vecdot_f32_2(int32_t n, - const float *restrict p_src_a_re, - const float *restrict p_src_a_im, - const float *restrict p_src_b_re, - const float *restrict p_src_b_im, - float *p_src_c_re, float *p_src_c_im) { + const float32_t *restrict p_src_a_re, + const float32_t *restrict p_src_a_im, + const float32_t *restrict p_src_b_re, + const float32_t *restrict p_src_b_im, + float32_t *p_src_c_re, + float32_t *p_src_c_im) { #ifdef ARMRAL_ARCH_SVE int32_t num_lanes = svcntw(); int32_t full_vectors = n / num_lanes; diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c index 82ff5c4137d5285b00e371ac4845857f843bd22b..80054329d5112cddd17129b1039986961eb19ac2 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c +++
b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c index 36d404383097d09fa75f88c0146560ed69c5102e..23516b9ddc407b40a8a72d5470efb7a5a3a0ffb5 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c index 1bceca56b3748472961fbf26cd8ee2217ec0dbef..acf15f3928cc092742cc5ee6efc9ba222c80d77a 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c index 9a40c00668fd86cde7c403cef05c170199b7fc82..0eb7ab5dbfc6ce254627e5413c9d6c7b3f001f9b 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c index a6660f20fe3e780bc6877ea7ca26413c6a87ca84..36de2e3412e52efc711445ef8362fda6ba67f9cc 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE @@ -17,14 +17,14 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, svbool_t pg = svptrue_b32(); for (int32_t i = 0; i < full_vectors; i++) { - svfloat32_t vec_a = svld1_f32(pg, (const float *)a); - svfloat32_t vec_b = svld1_f32(pg, (const float *)b); + svfloat32_t vec_a = svld1_f32(pg, (const float32_t *)a); + svfloat32_t vec_b = svld1_f32(pg, (const float32_t *)b); svfloat32_t vec_c = svdup_n_f32(0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 90); - svst1_f32(pg, (float *)c, vec_c); + svst1_f32(pg, (float32_t *)c, vec_c); a += num_64bit_lanes; b += num_64bit_lanes; @@ -35,23 +35,23 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, if (tail_size) { pg = svwhilelt_b32(0, 2 * tail_size); - svfloat32_t vec_a = svld1_f32(pg, (const float *)a); - svfloat32_t vec_b = svld1_f32(pg, (const float *)b); + svfloat32_t vec_a = svld1_f32(pg, (const float32_t *)a); + svfloat32_t 
vec_b = svld1_f32(pg, (const float32_t *)b); svfloat32_t vec_c = svdup_n_f32(0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 90); - svst1_f32(pg, (float *)c, vec_c); + svst1_f32(pg, (float32_t *)c, vec_c); } return ARMRAL_SUCCESS; #else uint32_t blk_cnt; /* Loop counter */ - float re_a; - float im_a; - float re_b; - float im_b; /* Temporary variables to store real and imaginary values */ + float32_t re_a; + float32_t im_a; + float32_t re_b; + float32_t im_b; /* Temporary variables to store real and imaginary values */ float32x4x2_t va; float32x4x2_t vb; @@ -62,8 +62,8 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, while (blk_cnt > 0U) { // load & separate real/imag (de-interleave 2) - va = vld2q_f32((const float *)a); - vb = vld2q_f32((const float *)b); + va = vld2q_f32((const float32_t *)a); + vb = vld2q_f32((const float32_t *)b); /* Increment pointers */ a += 4; @@ -77,7 +77,7 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, out_cplx.val[1] = vmulq_f32(va.val[0], vb.val[1]); out_cplx.val[1] = vfmaq_f32(out_cplx.val[1], va.val[1], vb.val[0]); - vst2q_f32((float *)c, out_cplx); + vst2q_f32((float32_t *)c, out_cplx); /* Increment pointer */ c += 4; diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c index ac068af2d05b2bb96a0399d59c761ad874dba4e3..5357eb7606fc19fe0c9940dd8b82f8a7c2c8cf44 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c @@ -1,17 +1,18 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE #include <arm_sve.h> #endif -armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *restrict a_re, - const float *restrict a_im, - const float *restrict b_re, - const float *restrict b_im, float *c_re, - float *c_im) { +armral_status armral_cmplx_vecmul_f32_2(int32_t n, + const float32_t *restrict a_re, + const float32_t *restrict a_im, + const float32_t *restrict b_re, + const float32_t *restrict b_im, + float32_t *c_re, float32_t *c_im) { #ifdef ARMRAL_ARCH_SVE int32_t num_lanes = svcntw(); svbool_t pg = svptrue_b32(); @@ -87,10 +88,10 @@ armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *restrict a_re, return ARMRAL_SUCCESS; #else uint32_t blk_cnt; /* Loop counter */ - float re_a; - float im_a; - float re_b; - float im_b; /* Temporary variables to store real and imaginary values */ + float32_t re_a; + float32_t im_a; + float32_t re_b; + float32_t im_b; /* Temporary variables to store real and imaginary values */ float32x4x2_t vc_re; float32x4x2_t vc_im; diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp index 617ddbafd7560c78cff01194a1ec41e8ee3c7bf2..373c25f65d48e068de0b87730eb00b001524e0de 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c index 322e0cdf46238ada687eaf9a6a1cf9d8c8326b14..5e13b7fbea0dbb49b22ef341b7b6b512c212efba
100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp index 34e0c2df3efa3fe9127c664b9f72356afb71209a..76d179ffb250a673354342471107428f52edd5f5 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp index 582de2fd7870b782a5ec21d6e88c38e341231812..623c6ba6352b093d402e8a167bb743aade424db1 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp @@ -1,11 +1,12 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + +#include "../bit_unpacking_common.hpp" #include "utils/vec_mul.hpp" -#include #if ARMRAL_ARCH_SVE >= 2 #include <arm_sve.h> #endif @@ -433,141 +434,7 @@ armral_status armral_mu_law_decompr_9bit(const uint32_t n_prb, } return ARMRAL_SUCCESS; #else - int16x8x2_t scale_v; - if (scale != nullptr) { - scale_v.val[0] = vdupq_n_s16(scale->re); - scale_v.val[1] = vdupq_n_s16(scale->im); - } - - for (uint32_t i = 0; i < n_prb; i++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - int16_t shift = src->exp; - - // ABCDEFGH - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - // BCDEFGHI - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - int8x8_t left_shifts = {0, 1, 2, 3, 4, 5, 6, 7}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - uint16x8_t a_left = vshll_n_u8(vshl_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vshl_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vshl_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); - - int16x8x3_t prb_comp_in; - prb_comp_in.val[0] = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> 7; - prb_comp_in.val[1] = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> 7; - prb_comp_in.val[2] = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> 7; - - // Extract the sign bit and absolute values for the PRB - int16x8x3_t prb_signs; - prb_signs.val[0] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[0]), vcgezq_s16(prb_comp_in.val[0]))); - prb_signs.val[1] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[1]), vcgezq_s16(prb_comp_in.val[1]))); - prb_signs.val[2] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[2]), vcgezq_s16(prb_comp_in.val[2]))); -
int16x8x3_t prb_comp_abs; - int16x8_t sat_pos = vdupq_n_s16(255); - prb_comp_abs.val[0] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[0])); - prb_comp_abs.val[1] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[1])); - prb_comp_abs.val[2] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[2])); - - // Expand each sample, absBitWidth=15, compBitWidth=9 - uint16x8x3_t check_thr1; - uint16x8x3_t check_thr3; - - // Expand - First Step: Set bitmasks based on prbCompAbs values - // Check1: if prbCompAbs <= 2^(compBitWidth - 2) = 128 - int16x8_t thr1_b9 = vdupq_n_s16(128); - check_thr1.val[0] = vcleq_s16(prb_comp_abs.val[0], thr1_b9); - check_thr1.val[1] = vcleq_s16(prb_comp_abs.val[1], thr1_b9); - check_thr1.val[2] = vcleq_s16(prb_comp_abs.val[2], thr1_b9); - - // Check3: if prbCompAbs > (2^(compBitWidth - 2) + 2^(compBitWidth - 3)) - int16x8_t thr2_b9 = vdupq_n_s16(192); - check_thr3.val[0] = vcgtq_s16(prb_comp_abs.val[0], thr2_b9); - check_thr3.val[1] = vcgtq_s16(prb_comp_abs.val[1], thr2_b9); - check_thr3.val[2] = vcgtq_s16(prb_comp_abs.val[2], thr2_b9); - - // Expand - Second Step: Perform decompression calculation - int16x8x3_t prb_abs_res1; - int16x8x3_t prb_abs_res2; - int16x8x3_t prb_abs_res3; - - // Case1: prbAbsRes1 = prbCompAbs * 2^(input_bits - output_bits) - // input_bits - output_bits = 6 - prb_abs_res1.val[0] = vqshlq_n_s16(prb_comp_abs.val[0], 6); - prb_abs_res1.val[1] = vqshlq_n_s16(prb_comp_abs.val[1], 6); - prb_abs_res1.val[2] = vqshlq_n_s16(prb_comp_abs.val[2], 6); - - // Case2: prbAbsRes2 = prbCompAbs * 2^(input_bits - output_bits + 1) - 2^13 - // input_bits - output_bits + 1 = 7 - int16x8_t sub_thr2_b9 = vdupq_n_s16(8192); - prb_abs_res2.val[0] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[0], 7), sub_thr2_b9); - prb_abs_res2.val[1] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[1], 7), sub_thr2_b9); - prb_abs_res2.val[2] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[2], 7), sub_thr2_b9); - - // Case3: prbAbsRes3 = prbCompAbs * 2^(absBitWidth - compBitWidth + 2) - - // 2^15 - // input_bits - output_bits + 2 = 8 - uint16x8_t sub_comm_b9 = vdupq_n_u16(32768); - prb_abs_res3.val[0] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[0], 8), sub_comm_b9)); - prb_abs_res3.val[1] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[1], 8), sub_comm_b9)); - prb_abs_res3.val[2] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[2], 8), sub_comm_b9)); - - // Expand - Fourth Step: OR among prbAbsRes vectors - int16x8x3_t exp_samples; - exp_samples.val[0] = vbslq_s16( - check_thr1.val[0], prb_abs_res1.val[0], - vbslq_s16(check_thr3.val[0], prb_abs_res3.val[0], prb_abs_res2.val[0])); - exp_samples.val[1] = vbslq_s16( - check_thr1.val[1], prb_abs_res1.val[1], - vbslq_s16(check_thr3.val[1], prb_abs_res3.val[1], prb_abs_res2.val[1])); - exp_samples.val[2] = vbslq_s16( - check_thr1.val[2], prb_abs_res1.val[2], - vbslq_s16(check_thr3.val[2], prb_abs_res3.val[2], prb_abs_res2.val[2])); - - // Apply sign and shift - exp_samples.val[0] = vmulq_s16(exp_samples.val[0], prb_signs.val[0]); - exp_samples.val[1] = vmulq_s16(exp_samples.val[1], prb_signs.val[1]); - exp_samples.val[2] = vmulq_s16(exp_samples.val[2], prb_signs.val[2]); - - int16x8_t comp_shift_vec = vdupq_n_s16(-shift); - - exp_samples.val[0] = vshlq_s16(exp_samples.val[0], comp_shift_vec); - exp_samples.val[1] = vshlq_s16(exp_samples.val[1], comp_shift_vec); - exp_samples.val[2] = vshlq_s16(exp_samples.val[2], comp_shift_vec); - - if (scale != nullptr) { - scale_and_store3_cmplx((int16_t *)dst, 
exp_samples.val[0], - exp_samples.val[1], exp_samples.val[2], scale_v); - dst += 12; - } else { - vst1q_s16((int16_t *)dst, exp_samples.val[0]); - dst += 4; - vst1q_s16((int16_t *)dst, exp_samples.val[1]); - dst += 4; - vst1q_s16((int16_t *)dst, exp_samples.val[2]); - dst += 4; - } - src++; - } + common_decompr_9bit_neon<false, true>(n_prb, src, dst, scale); return ARMRAL_SUCCESS; #endif } diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp index 310c0170555043051bc885151229324b04e9b146..835a1081041ab86b99e7dfefe924ca0eedaa1214 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp index 7d41c34f37cd6e5b9c4de446b3fcd8ba87a938ae..495ec44203002f875d7b9fb055d5aa47613f4596 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp @@ -1,12 +1,14 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + #if ARMRAL_ARCH_SVE >= 2 #include <arm_sve.h> #endif +#include "../bit_unpacking_common.hpp" #include "utils/vec_mul.hpp" armral_status armral_block_float_decompr_8bit( @@ -190,58 +192,7 @@ armral_status armral_block_float_decompr_9bit( } return ARMRAL_SUCCESS; #else - for (uint32_t num_prb = 0; num_prb < n_prb; num_prb++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - int16_t exp = src->exp; - - // ABCDEFGH - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - // BCDEFGHI - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - uint8x8_t left_shifts = {1, 2, 4, 8, 16, 32, 64, 128}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - // e.g. for second lanes (x07=B, x18=C), extracting 9-bit bbbbbbbbb - // x07[1] = abbbbbbb - // x18[1] = bbcccccc - // > vshl_u8(x07, left_shifts) = vshl_u8(abbbbbbb, 1) = bbbbbbb0 - // > vshl_u8(x18, right_shifts) = vshl_u8(bbcccccc, -6) = 000000bb - // > vshll_n_u8(vshl_u8(x07, left_shifts), 8) = bbbbbbb0_00000000 - // > vshll_n_u8(vshl_u8(x18, right_shifts), 7) = 0000000b_b0000000 - // note how we have populated the sign-bit of the 16-bit lane. A shift - // right (dealing with the block exponent) therefore preserves the sign - // of the original value as expected.
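The removed comment above captures the bit-level trick that all three 9-bit decompressors share, and which the new `bit_unpacking_common.hpp` header keeps. As a minimal scalar model of the same unpacking (not library code; `unpack_9bit` and its parameters are illustrative only, assuming 9-bit samples packed MSB-first into the PRB mantissa bytes):

```c
#include <stdint.h>

/* Scalar sketch of the NEON unpacking: sample i occupies bits
   [9*i, 9*i + 9) of the packed bytes. The two straddling bytes are
   shifted so that the field's sign bit lands in bit 15 of an int16_t,
   and a single arithmetic right shift then recovers the signed value. */
static int16_t unpack_9bit(const uint8_t *bytes, unsigned i, int shift) {
  unsigned bit = 9u * i;
  unsigned off = bit % 8u; /* bits of the first byte already consumed */
  /* First byte: drop the used MSBs, park the remainder in bits 15..8. */
  uint16_t left = (uint16_t)((uint8_t)(bytes[bit / 8u] << off)) << 8;
  /* Second byte: keep its top (off + 1) bits, park them in bits 14..7. */
  uint16_t right = (uint16_t)(bytes[bit / 8u + 1u] >> (7u - off)) << 7;
  /* Field now sits in bits 15..7; >> sign-extends on the compilers this
     library targets (implementation-defined in ISO C). */
  return (int16_t)((int16_t)(left | right) >> shift);
}
```

Passing `shift == 7` yields the raw 9-bit code (the block-scaling and mu-law paths), while `shift == 7 - exp` folds the block-float exponent into the same shift, which is exactly the `is_block_float` branch of the shared kernel introduced later in this diff.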
- uint16x8_t a_left = vshll_n_u8(vmul_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vmul_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vmul_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); - int16x8_t a_comb = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)); - int16x8_t b_comb = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)); - int16x8_t c_comb = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)); - a_comb = a_comb >> (7 - exp); - b_comb = b_comb >> (7 - exp); - c_comb = c_comb >> (7 - exp); - - if (scale != nullptr) { - vst1q_s16((int16_t *)&dst[0], cmplx_mul_combined_re_im(a_comb, *scale)); - vst1q_s16((int16_t *)&dst[4], cmplx_mul_combined_re_im(b_comb, *scale)); - vst1q_s16((int16_t *)&dst[8], cmplx_mul_combined_re_im(c_comb, *scale)); - } else { - vst1q_s16((int16_t *)&dst[0], a_comb); - vst1q_s16((int16_t *)&dst[4], b_comb); - vst1q_s16((int16_t *)&dst[8], c_comb); - } - - src++; - dst += 12; - } + common_decompr_9bit_neon<true, false>(n_prb, src, dst, scale); return ARMRAL_SUCCESS; #endif } diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp index 7d96bf406fb935057215f6ac15dd37cf4e3bc57c..14f7488f5454c9dd209438ee4a33485da4ce2882 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp index c8fb82a4edd6850db067abda6a948b80fbe9d329..3de46c3c1d1aa277d82e01ad2ef12b5af1439c78 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp @@ -1,11 +1,12 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + +#include "../bit_unpacking_common.hpp" #include "intrinsics.h" #include "utils/vec_mul.hpp" -#include #if ARMRAL_ARCH_SVE >= 2 #include <arm_sve.h> @@ -174,60 +175,7 @@ armral_status armral_block_scaling_decompr_9bit( src++; } #else - for (unsigned int i = 0; i < n_prb; i++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - - // Load in the input data byte by byte - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - int8x8_t left_shifts = {0, 1, 2, 3, 4, 5, 6, 7}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - uint16x8_t a_left = vshll_n_u8(vshl_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vshl_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vshl_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right =
vshll_n_u8(vshl_u8(c18, right_shifts), 7); - - // Get 9bit input elements - int16x8_t prb_comp_in[3]; - prb_comp_in[0] = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> 7; - prb_comp_in[1] = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> 7; - prb_comp_in[2] = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> 7; - - // Decompression process - int16x8_t prb_decomp[3]; - int16x8_t scaling_factor = vdupq_n_s16(src->exp); - prb_decomp[0] = vmulq_s16(prb_comp_in[0], scaling_factor); - prb_decomp[1] = vmulq_s16(prb_comp_in[1], scaling_factor); - prb_decomp[2] = vmulq_s16(prb_comp_in[2], scaling_factor); - - // Store decompressed data - if (scale != nullptr) { - vst1q_s16((int16_t *)&dst[0], - cmplx_mul_combined_re_im(prb_decomp[0], *scale)); - vst1q_s16((int16_t *)&dst[4], - cmplx_mul_combined_re_im(prb_decomp[1], *scale)); - vst1q_s16((int16_t *)&dst[8], - cmplx_mul_combined_re_im(prb_decomp[2], *scale)); - dst += 12; - } else { - vst1q_s16((int16_t *)dst, prb_decomp[0]); - dst += 4; - vst1q_s16((int16_t *)dst, prb_decomp[1]); - dst += 4; - vst1q_s16((int16_t *)dst, prb_decomp[2]); - dst += 4; - } - src++; - } + common_decompr_9bit_neon<false, false>(n_prb, src, dst, scale); #endif return ARMRAL_SUCCESS; } diff --git a/src/DuRuInterface/bit_packing_common.hpp b/src/DuRuInterface/bit_packing_common.hpp index 2c7af3b4a26fcb6484da28e5de84bd903d9ebfe3..070318a968cbc32a792e8e9f5da94a5bc1fe6800 100644 --- a/src/DuRuInterface/bit_packing_common.hpp +++ b/src/DuRuInterface/bit_packing_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/DuRuInterface/bit_unpacking_common.hpp b/src/DuRuInterface/bit_unpacking_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d30c7dd2b0dba748d7a360fdd7a3bcdbefc806af --- /dev/null +++ b/src/DuRuInterface/bit_unpacking_common.hpp @@ -0,0 +1,185 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include "utils/vec_mul.hpp" + +namespace { + +inline void mu_law_decomp_and_store_9bit_neon(int16x8x3_t &prb_comp, + armral_cmplx_int16_t *dst, + const armral_cmplx_int16_t *scale, + int16_t shift) { + // Extract the sign bit and absolute values for the PRB + int16x8x3_t prb_signs; + prb_signs.val[0] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[0]), vcgezq_s16(prb_comp.val[0]))); + prb_signs.val[1] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[1]), vcgezq_s16(prb_comp.val[1]))); + prb_signs.val[2] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[2]), vcgezq_s16(prb_comp.val[2]))); + + int16x8x3_t prb_comp_abs; + int16x8_t sat_pos = vdupq_n_s16(255); + prb_comp_abs.val[0] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[0])); + prb_comp_abs.val[1] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[1])); + prb_comp_abs.val[2] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[2])); + + // Expand each sample, absBitWidth=15, compBitWidth=9 + uint16x8x3_t check_thr1; + uint16x8x3_t check_thr3; + + // Expand - First Step: Set bitmasks based on prbCompAbs values + // Check1: if prbCompAbs <= 2^(compBitWidth - 2) = 128 + int16x8_t thr1_b9 = vdupq_n_s16(128); + check_thr1.val[0] = vcleq_s16(prb_comp_abs.val[0], thr1_b9); + check_thr1.val[1] = vcleq_s16(prb_comp_abs.val[1], thr1_b9); + check_thr1.val[2] = vcleq_s16(prb_comp_abs.val[2], thr1_b9); + + // Check3: if prbCompAbs >
(2^(compBitWidth - 2) + 2^(compBitWidth - 3)) + int16x8_t thr2_b9 = vdupq_n_s16(192); + check_thr3.val[0] = vcgtq_s16(prb_comp_abs.val[0], thr2_b9); + check_thr3.val[1] = vcgtq_s16(prb_comp_abs.val[1], thr2_b9); + check_thr3.val[2] = vcgtq_s16(prb_comp_abs.val[2], thr2_b9); + + // Expand - Second Step: Perform decompression calculation + int16x8x3_t prb_abs_res1; + int16x8x3_t prb_abs_res2; + int16x8x3_t prb_abs_res3; + + // Case1: prbAbsRes1 = prbCompAbs * 2^(input_bits - output_bits) + // input_bits - output_bits = 6 + prb_abs_res1.val[0] = vqshlq_n_s16(prb_comp_abs.val[0], 6); + prb_abs_res1.val[1] = vqshlq_n_s16(prb_comp_abs.val[1], 6); + prb_abs_res1.val[2] = vqshlq_n_s16(prb_comp_abs.val[2], 6); + + // Case2: prbAbsRes2 = prbCompAbs * 2^(input_bits - output_bits + 1) - 2^13 + // input_bits - output_bits + 1 = 7 + int16x8_t sub_thr2_b9 = vdupq_n_s16(8192); + prb_abs_res2.val[0] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[0], 7), sub_thr2_b9); + prb_abs_res2.val[1] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[1], 7), sub_thr2_b9); + prb_abs_res2.val[2] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[2], 7), sub_thr2_b9); + + // Case3: prbAbsRes3 = prbCompAbs * 2^(absBitWidth - compBitWidth + 2) - + // 2^15 + // input_bits - output_bits + 2 = 8 + uint16x8_t sub_comm_b9 = vdupq_n_u16(32768); + prb_abs_res3.val[0] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[0], 8), sub_comm_b9)); + prb_abs_res3.val[1] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[1], 8), sub_comm_b9)); + prb_abs_res3.val[2] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[2], 8), sub_comm_b9)); + + // Expand - Fourth Step: OR among prbAbsRes vectors + int16x8x3_t exp_samples; + exp_samples.val[0] = vbslq_s16( + check_thr1.val[0], prb_abs_res1.val[0], + vbslq_s16(check_thr3.val[0], prb_abs_res3.val[0], prb_abs_res2.val[0])); + exp_samples.val[1] = vbslq_s16( + check_thr1.val[1], prb_abs_res1.val[1], + vbslq_s16(check_thr3.val[1], prb_abs_res3.val[1], prb_abs_res2.val[1])); + exp_samples.val[2] = vbslq_s16( + check_thr1.val[2], prb_abs_res1.val[2], + vbslq_s16(check_thr3.val[2], prb_abs_res3.val[2], prb_abs_res2.val[2])); + + // Apply sign and shift + exp_samples.val[0] = vmulq_s16(exp_samples.val[0], prb_signs.val[0]); + exp_samples.val[1] = vmulq_s16(exp_samples.val[1], prb_signs.val[1]); + exp_samples.val[2] = vmulq_s16(exp_samples.val[2], prb_signs.val[2]); + + int16x8_t comp_shift_vec = vdupq_n_s16(-shift); + + exp_samples.val[0] = vshlq_s16(exp_samples.val[0], comp_shift_vec); + exp_samples.val[1] = vshlq_s16(exp_samples.val[1], comp_shift_vec); + exp_samples.val[2] = vshlq_s16(exp_samples.val[2], comp_shift_vec); + + // Store + if (scale != nullptr) { + int16x8x2_t scale_v; + scale_v.val[0] = vdupq_n_s16(scale->re); + scale_v.val[1] = vdupq_n_s16(scale->im); + scale_and_store3_cmplx((int16_t *)dst, exp_samples.val[0], + exp_samples.val[1], exp_samples.val[2], scale_v); + } else { + vst1q_s16((int16_t *)&dst[0], exp_samples.val[0]); + vst1q_s16((int16_t *)&dst[4], exp_samples.val[1]); + vst1q_s16((int16_t *)&dst[8], exp_samples.val[2]); + } +} + +template <bool is_block_float, bool is_mu_law> +void common_decompr_9bit_neon(uint32_t n_prb, + const armral_compressed_data_9bit *src, + armral_cmplx_int16_t *dst, + const armral_cmplx_int16_t *scale) { + for (uint32_t i = 0; i < n_prb; i++) { + int16_t exp = src->exp; + const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; + + // Load in the input data byte by byte + // ABCDEFGH + uint8x8_t a07 = vld1_u8(&data_in[0]); + uint8x8_t b07 =
vld1_u8(&data_in[9]); + uint8x8_t c07 = vld1_u8(&data_in[18]); + // BCDEFGHI + uint8x8_t a18 = vld1_u8(&data_in[1]); + uint8x8_t b18 = vld1_u8(&data_in[10]); + uint8x8_t c18 = vld1_u8(&data_in[19]); + + uint8x8_t left_shifts = {1, 2, 4, 8, 16, 32, 64, 128}; + int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; + + uint16x8_t a_left = vshll_n_u8(vmul_u8(a07, left_shifts), 8); + uint16x8_t b_left = vshll_n_u8(vmul_u8(b07, left_shifts), 8); + uint16x8_t c_left = vshll_n_u8(vmul_u8(c07, left_shifts), 8); + uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); + uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); + uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); + + int16x8x3_t prb_comp; + int16_t shift = 7; + if constexpr (is_block_float) { + shift -= exp; + } + prb_comp.val[0] = + vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> shift; + prb_comp.val[1] = + vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> shift; + prb_comp.val[2] = + vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> shift; + + if constexpr (is_mu_law) { + mu_law_decomp_and_store_9bit_neon(prb_comp, dst, scale, exp); + } else { + if constexpr (not is_block_float) { // Block Scaling + prb_comp.val[0] = vmulq_n_s16(prb_comp.val[0], exp); + prb_comp.val[1] = vmulq_n_s16(prb_comp.val[1], exp); + prb_comp.val[2] = vmulq_n_s16(prb_comp.val[2], exp); + } + + // Block Float and Scaling store + if (scale != nullptr) { + vst1q_s16((int16_t *)&dst[0], + cmplx_mul_combined_re_im(prb_comp.val[0], *scale)); + vst1q_s16((int16_t *)&dst[4], + cmplx_mul_combined_re_im(prb_comp.val[1], *scale)); + vst1q_s16((int16_t *)&dst[8], + cmplx_mul_combined_re_im(prb_comp.val[2], *scale)); + } else { + vst1q_s16((int16_t *)&dst[0], prb_comp.val[0]); + vst1q_s16((int16_t *)&dst[4], prb_comp.val[1]); + vst1q_s16((int16_t *)&dst[8], prb_comp.val[2]); + } + } + dst += 12; + src++; + } +} + +} // namespace \ No newline at end of file diff --git a/src/LowerPHY/Correlation/arm_correlation.c b/src/LowerPHY/Correlation/arm_correlation.c index 71dce495d3c67c6f8e14e064cf37373c75289586..85cca8c13606ce775210f547f27c22f36ef0f996 100644 --- a/src/LowerPHY/Correlation/arm_correlation.c +++ b/src/LowerPHY/Correlation/arm_correlation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FFT/fft_cf32.cpp b/src/LowerPHY/FFT/fft_cf32.cpp index 830bb0239b1e2d402047253689f4fab04847fc9f..28cbe6f9514998eb6ab29ac8cd99715593af6255 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" #include "fft_plan.hpp" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c index bf5176b8c123ebbee9351f2be7261d627222ba22..72cc33ca6555cd98840b69fe136789800116b5ac 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gs.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h 
b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h index 98033b7e4988b00db4b80038909edacb9bf0d6b9..ba99f76072c9030e9b323d03c15456ca6537dd63 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c index 23c5797c849a16d2a992de454f55802a99662166..c70d093a5112e1cb65c78c438f961904584fbc56 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h index 8edbe4603f4ed6c016be3f84c48777b466a8d290..f6bf005a4178e065142091c854080bad190eb17c 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c index a61ff10b7e39fcb31192749fdc4dc4723364decb..1b42288739ad34d7eeb7b6d5ec8eb88a592e8999 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h index 57014ea8e384b7a651f83a447b6fd9e1ae4c24dd..3c6966f3555bde2fb4fe238dc23003aadd24d1a6 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c index cd7b9b1cc3965bfec80083f4ed4038c8cf35e7c0..84ba9f8e11c2522d20c4e4f7046c24b194d1f1af 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h index 9b78818cb0a752c60b3a344c140d72b267d11e86..dd75a5537ab29d0a024f34b1c9da641ce20afd96 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates 
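Stepping back from the FFT header relicensing for a moment: the DuRuInterface hunks earlier in this diff replace three near-identical NEON loops with one shared kernel selected by two boolean template parameters. A hypothetical dispatch helper (not library API; it assumes the `<is_block_float, is_mu_law>` parameter order restored above in `bit_unpacking_common.hpp`) makes the mapping explicit:

```cpp
#include "armral.h"
#include "bit_unpacking_common.hpp" // new header added in this diff

// Illustrative only: which flag combination each O-RAN 9-bit
// decompression scheme selects in the shared NEON kernel.
enum class Scheme { BlockFloat, BlockScaling, MuLaw };

inline void decompr_9bit(Scheme s, uint32_t n_prb,
                         const armral_compressed_data_9bit *src,
                         armral_cmplx_int16_t *dst,
                         const armral_cmplx_int16_t *scale) {
  switch (s) {
  case Scheme::BlockFloat: // shift samples by 7 - exp
    common_decompr_9bit_neon<true, false>(n_prb, src, dst, scale);
    break;
  case Scheme::BlockScaling: // shift by 7, then multiply by exp
    common_decompr_9bit_neon<false, false>(n_prb, src, dst, scale);
    break;
  case Scheme::MuLaw: // shift by 7, then mu-law expansion per sample
    common_decompr_9bit_neon<false, true>(n_prb, src, dst, scale);
    break;
  }
}
```

This mirrors the three repaired call sites above, so the per-scheme behavior lives entirely in the `if constexpr` branches of the shared kernel.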
*/ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c index 53ef2836df77de1ff664d0aeadd949f7cf1a9286..72e53ca3e1fc0d78c88020e4edb1f49c98133148 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_t_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h index 37e39fdb6452b37a9f4f6dd896470ba6248f0acf..88f8678fc205d4542de74743f27abf9dd1dd0cb3 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c index 86d9544f013d9258314ffb61ec46f0f862ae5675..d46ed9d262a485367fa3617c355ccca55d4eb6fd 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ab_t_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h index aaba87482ad180df6e4b640329b45304d13459c6..896851eb572841645d9efdad724492b4617731c7 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c index 33d7282ad7d9d2584cc737d7db2e2df3f362990a..22ab0d6e819066b2c1837c70521b79cf89cf5944 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h index 8bbb2dee041ca4ce1c8f8cd28abb2a14eb2009a5..85fe5a07a56be409546ac9e440f0722216c71780 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c index cac45e521448cc54c9def99fd943e868f7f3eb8b..d656718b9e134cb8770cb29fa6bc1111972c785b 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h index 9f0f294bb7306a1914f66681128cbb8a8831e973..fcef99b1fbcb91f583521224559a549e2cf0e669 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index 2da312ac658911449b215da9769f31be92b4b683..c9cb8f797a504786960361cb5a1ddd08ff70a40f 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c index ecb656613d38d799e19a64538511ae1dd39b6561..d00d3766c37d291b1c9b211ba1db2c2a967be6c5 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cf32_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h index fe8b7508f48c0d880a18e9ab55a6e1dc88f23d60..8b47abc150d1d22e9d83e935eea8bbe58ab0d5ee 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c index 609bf1df3712a817bc8d2527054742fba3e70839..5ccc3e69c18aa52d0f494c54e8f00681ccd43029 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cs16_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h index 163f8631af756178af6c6919622c069d9933d34e..756cff9a8840fcb83bfe732f088fa2ff850470c2 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c index 20287bc2c9c459a0020f2eb6e719ccb7b8352960..349d5488a6dd184b88c41a778a3390b4e9099b96 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h index 8476f0ed6d46bb5dbb5121a70875534fa9658d99..864b0d9604bb7f034ee24564d4ba404b078bdfe3 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index 02313339e585ec1e84f05c5b2458b02fd11a3d63..f45d07fef36e86cbaf4237d40fa63cd8e0a43b14 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_execute.hpp b/src/LowerPHY/FFT/fft_execute.hpp index 714d2575a3a52641952dc553ce57207faab1c047..4cf5edd6dcfd16ae8a6b287a192b6915f3175e8b 100644 --- a/src/LowerPHY/FFT/fft_execute.hpp +++ b/src/LowerPHY/FFT/fft_execute.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_helper.h b/src/LowerPHY/FFT/fft_helper.h index 98f7c51ac48c49cd4d46e13e0286f9ab3eb17a92..978ed19b3a23e5669aca3b8b76bf7e4973fabb32 100644 --- a/src/LowerPHY/FFT/fft_helper.h +++ b/src/LowerPHY/FFT/fft_helper.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_level.cpp b/src/LowerPHY/FFT/fft_level.cpp index a4402dc3658e6e4d6bdf462547c868548bc278c8..e91517d82d660cc0fd2a80c17dd6b9ed214a5ac7 100644 --- a/src/LowerPHY/FFT/fft_level.cpp +++ b/src/LowerPHY/FFT/fft_level.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_level.hpp" diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index 06cc0df09f3d0e3838b75c33426236b0520a5c32..fa0e64e75b6f11093428bbee964a23b16414f2e6 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index 833680a09ad7f7439fec55200435c071678d09cd..4b10a619bb3b298c2ab6b72beebe59dfcc1ce2da 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ b/src/LowerPHY/FFT/fft_plan.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_plan.hpp" #include "fft_cf32_kernel_lookup.h" @@ -305,7 +305,7 @@ template int factorize(int n, armral_fft_direction_t dir, int max_levels, armral::fft::lev_base_t **levels) { // search through the set of supported factors to 
find a suitable - // factorisation, then use that to build the level data structures. + // factorization, then use that to build the level data structures. int factors[max_levels]; int num_factors = factorize_descending(n, dir, max_levels, factors); if (num_factors == 0) { diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index 419622390f395d13b37454b3df54e61fca6dd802..b793eb2a16aa8022cc71e0b5ff7b736e02021ba0 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_types.hpp b/src/LowerPHY/FFT/fft_types.hpp index 65ebfa64dea0408242f43e1bc5db50909037ab98..9cc619913aa262aa7657b3831cc074e4e530204a 100644 --- a/src/LowerPHY/FFT/fft_types.hpp +++ b/src/LowerPHY/FFT/fft_types.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index a05479cfc94daf8f4aaebf4107813ab6f03da2d4..1678c6a882e4b9ba740a1b8a52d99eee5063fa61 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader.hpp" diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index 6d1d21f004443425dd6b38d00718d0fe3c1e1e05..bbe53a128b94062775850396790430acadcbe0f8 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader_generator.cpp b/src/LowerPHY/FFT/rader_generator.cpp index 89e138630b5646cd6e2f329a27b9b30a09a9994b..9e798f2df322ec8414452d78825e3d7a2dd1acd9 100644 --- a/src/LowerPHY/FFT/rader_generator.cpp +++ b/src/LowerPHY/FFT/rader_generator.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader_generator.hpp" diff --git a/src/LowerPHY/FFT/rader_generator.hpp b/src/LowerPHY/FFT/rader_generator.hpp index bc219d9c100719b16dae1c63859d79564d9057de..49b3cfda30910455756dd48acc33976ee22f02ba 100644 --- a/src/LowerPHY/FFT/rader_generator.hpp +++ b/src/LowerPHY/FFT/rader_generator.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32.c b/src/LowerPHY/FIR/arm_fir_filter_cf32.c index 428e2312d704d3b845356c3fd1c0045730c4c087..04ffc612e47d482f0b83a2c9fbe94d587359f764 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -10,8 +10,8 @@ #ifdef ARMRAL_ARCH_SVE -static inline svfloat32x4_t 
fir_sve_blk_4(svbool_t pg, const float *in, - const float *coeffs, +static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float32_t *in, + const float32_t *coeffs, uint32_t n_taps) { // Compute FIR for four vector-lengths of data. Coeffs array is // unrolled by 2 and we have 2 accumulators per vector length, as @@ -110,10 +110,10 @@ static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float *in, return svcreate4(y1, y2, y3, y4); } -static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float *in, - const float *coeffs, +static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float32_t *in, + const float32_t *coeffs, uint32_t n_taps) { - // Compute FIR for 2 vector-lengths of data. Lightly optimised - this + // Compute FIR for 2 vector-lengths of data. Lightly optimized - this // function will be called at most once per call of // arm_fir_filter_cf32. Coefficient array is unrolled by factor 2, as // for fir_sve_blk, with the difference that we have two accumulators @@ -167,10 +167,11 @@ static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float *in, return svcreate2(y1, y2); } -static inline svfloat32_t fir_sve_blk(svbool_t pg, const float *in, - const float *coeffs, uint32_t n_taps) { +static inline svfloat32_t fir_sve_blk(svbool_t pg, const float32_t *in, + const float32_t *coeffs, + uint32_t n_taps) { // Compute FIR for one vector-length of data. This version is not - // really optimised, as it is only ever used as the tail of the more + // really optimized, as it is only ever used as the tail of the more // heavily unrolled versions above. The loop over the coeffs array is // unrolled by factor 2, since we can fit 2 complex values in a // quad-word. @@ -210,9 +211,9 @@ armral_status armral_fir_filter_cf32(uint32_t size, uint32_t taps, svbool_t ptrue_b32 = svptrue_b32(); uint32_t x_blk_idx = 0; uint32_t xinc = svcntw() * 2; - const float *c = (const float *)coeffs; - const float *in = (const float *)input; - float *out = (float *)output; + const float32_t *c = (const float32_t *)coeffs; + const float32_t *in = (const float32_t *)input; + float32_t *out = (float32_t *)output; for (; x_blk_idx + xinc * 2 < size * 2; x_blk_idx += xinc * 2) { svfloat32x4_t y = fir_sve_blk_4(ptrue_b32, in + x_blk_idx, c, taps); diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c index e60e5e0e3f520c9ce527aacd83ea44ca05da585b..ebeef5def330ae6532168d19b2f8e08817e7ad8a 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -31,7 +31,7 @@ static inline void sv_fir_block(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); // ld2_u64 allows us to separate even and odd complex elements, for example // if i == 0 then: @@ -57,7 +57,7 @@ static inline void sv_fir_block(svbool_t pg, y = sv_full_cmla(pg, y, c, x); } - svst1_f32(pg, (float *)out, y); + svst1_f32(pg, (float32_t *)out, y); } static inline void sv_fir_block_2(svbool_t pg, @@ -84,7 +84,7 @@ static inline void sv_fir_block_2(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c 
= svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svuint64x2_t x1 = svld2_vnum_u64(pg, &in[i], 2); @@ -116,8 +116,8 @@ static inline void sv_fir_block_2(svbool_t pg, y_1 = sv_full_cmla(pg, y_1, c, x_1); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, (float *)out, 1, y_1); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); } static inline void sv_fir_block_4(svbool_t pg, @@ -142,7 +142,7 @@ static inline void sv_fir_block_4(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svfloat32_t x0_0 = svreinterpret_f32_u64(svget2(x0, 0)); svfloat32_t x1_0 = svreinterpret_f32_u64(svget2(x0, 1)); @@ -221,10 +221,10 @@ static inline void sv_fir_block_4(svbool_t pg, y_3 = sv_full_cmla(pg, y_3, c, x_3); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, (float *)out, 1, y_1); - svst1_vnum_f32(pg, (float *)out, 2, y_2); - svst1_vnum_f32(pg, (float *)out, 3, y_3); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); + svst1_vnum_f32(pg, (float32_t *)out, 2, y_2); + svst1_vnum_f32(pg, (float32_t *)out, 3, y_3); } static inline void sv_fir_block_8(svbool_t pg, @@ -248,7 +248,7 @@ static inline void sv_fir_block_8(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svfloat32_t x0_0 = svreinterpret_f32_u64(svget2(x0, 0)); svfloat32_t x0_1 = svreinterpret_f32_u64(svget2(x0, 1)); @@ -361,14 +361,14 @@ static inline void sv_fir_block_8(svbool_t pg, y_7 = sv_full_cmla(pg, y_7, c, x_7); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, (float *)out, 1, y_1); - svst1_vnum_f32(pg, (float *)out, 2, y_2); - svst1_vnum_f32(pg, (float *)out, 3, y_3); - svst1_vnum_f32(pg, (float *)out, 4, y_4); - svst1_vnum_f32(pg, (float *)out, 5, y_5); - svst1_vnum_f32(pg, (float *)out, 6, y_6); - svst1_vnum_f32(pg, (float *)out, 7, y_7); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); + svst1_vnum_f32(pg, (float32_t *)out, 2, y_2); + svst1_vnum_f32(pg, (float32_t *)out, 3, y_3); + svst1_vnum_f32(pg, (float32_t *)out, 4, y_4); + svst1_vnum_f32(pg, (float32_t *)out, 5, y_5); + svst1_vnum_f32(pg, (float32_t *)out, 6, y_6); + svst1_vnum_f32(pg, (float32_t *)out, 7, y_7); } #endif diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16.c b/src/LowerPHY/FIR/arm_fir_filter_cs16.c index 71d03c1d423a1d6ffb0a1a065731c469006ba4ff..c4fa6954f57e19958a8a0e968ca1cb9ccbf72e77 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c index 0bb694772c7dab3a574fe09cb668ebf0f757622f..302c4464af0c7ddd33eaf5e569a712c964dcc7be 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -350,7 +350,7 @@ static inline void sv_fir_block(svbool_t pg, const armral_cmplx_int16_t *restrict coeffs, armral_cmplx_int16_t *out, uint32_t taps) { // Compute FIR on one vector-length of data (read 2 vector-lengths, write 1). - // This version is only used as a tail for the more heavily optimised, + // This version is only used as a tail for the more heavily optimized, // unrolled versions above. const uint32_t *in = (const uint32_t *)input; diff --git a/src/LowerPHY/Scrambling/arm_scrambling.cpp b/src/LowerPHY/Scrambling/arm_scrambling.cpp index 3ff12f615bc0927f8a6fde3a34babd90a015c771..b9f1812873eef1b18e53b685ffd92f50e166ef9f 100644 --- a/src/LowerPHY/Scrambling/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/arm_scrambling.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp index 98452a053fe1b7adef904271796e7b55c5148fbf..d332880da5e29f0d126f105d25794936ed5ade3c 100644 --- a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/SVD/arm_svd.cpp b/src/MatrixFactorizations/SVD/arm_svd.cpp similarity index 90% rename from src/SVD/arm_svd.cpp rename to src/MatrixFactorizations/SVD/arm_svd.cpp index 1d24eeb1a66f450e45c2875f31bf827cad92eeea..b9e6cb13cd018936c2cc5e1e57743b0d61c0d9b9 100644 --- a/src/SVD/arm_svd.cpp +++ b/src/MatrixFactorizations/SVD/arm_svd.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -35,10 +35,10 @@ inline void cmplx_vecdot_conj_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, int32_t i = 0; for (; i * num_lanes <= n - 2 * num_lanes; i += 2) { svbool_t pg = svptrue_b32(); - svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float *)p_src_b, i); - svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float *)p_src_a, i + 1); - svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float *)p_src_b, i + 1); + svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); + svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i + 1); + svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i + 1); acc0 = svcmla_f32_m(pg, acc0, vec_b0, vec_a0, 0); acc0 = svcmla_f32_m(pg, acc0, vec_b0, vec_a0, 270); @@ -48,8 +48,8 @@ inline void cmplx_vecdot_conj_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, for (; i * num_lanes < n; ++i) { svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * n); - svfloat32_t vec_a = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b = svld1_vnum_f32(pg, (const float *)p_src_b, i); + svfloat32_t vec_a = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); acc0 = svcmla_f32_m(pg, acc0, vec_b, vec_a, 
0); acc0 = svcmla_f32_m(pg, acc0, vec_b, vec_a, 270); @@ -218,8 +218,8 @@ inline void cmplx_axmy_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, // in division by a small floating point number. // Epsilon is taken to be 2^{-(p-1)}/2, p=24 for float. // We use the same value of epsilon used in LAPACK. -const float eps = 5.96046E-08; -const float safemin = 1.17549E-38 / eps; +const float32_t eps = 5.96046E-08; +const float32_t safemin = 1.17549E-38 / eps; // Compute a * b inline armral_cmplx_f32_t mult_cf32(armral_cmplx_f32_t a, @@ -280,12 +280,12 @@ inline armral_cmplx_f32_t mult_add_cf32(armral_cmplx_f32_t a, } // Compute a * conj(a) -inline float square_conj_cf32(armral_cmplx_f32_t a) { +inline float32_t square_conj_cf32(armral_cmplx_f32_t a) { return a.re * a.re + a.im * a.im; } inline armral_cmplx_f32_t inv_cf32(armral_cmplx_f32_t a) { - float tmp = a.re * a.re + a.im * a.im; + float32_t tmp = a.re * a.re + a.im * a.im; return {a.re / tmp, -a.im / tmp}; } @@ -298,7 +298,7 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, armral_cmplx_f32_t alpha = aii; // Sum of x[i] * conj(x[i]) - float sum = 0.0F; + float32_t sum = 0.0F; for (int i = 0; i < n * incx; i += incx) { sum += square_conj_cf32(x[i]); } @@ -310,11 +310,11 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, // Add alpha * conj(alpha) to sum // to compute the 2 norm of the full vector sum += square_conj_cf32(alpha); - float beta = -copysign(sqrt(sum), alpha.re); - float rsafemin = 1.0F / safemin; + float32_t beta = -copysign(sqrt(sum), alpha.re); + float32_t rsafemin = 1.0F / safemin; int cnt = 0; int max_attempt = 10; - float scale = 1.0F; + float32_t scale = 1.0F; // Check if beta is small enough to induce // overflow when taking the inverse, and // if it is the case, scale to avoid overflow @@ -343,10 +343,10 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, armral_cmplx_f32_t tau; tau.re = (beta - alpha.re) / beta; tau.im = -alpha.im / beta; - armral_cmplx_f32_t normalisation_factor = + armral_cmplx_f32_t normalization_factor = inv_cf32({alpha.re - beta, alpha.im}); for (int i = 0; i < n * incx; i += incx) { - x[i] = mult_cf32(normalisation_factor, x[i]); + x[i] = mult_cf32(normalization_factor, x[i]); } beta /= scale; aii = {beta, 0.0F}; @@ -354,7 +354,8 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, } // Computation of Givens rotation components. 
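/* Editorial aside, not part of the patch: rotg below computes a Givens
   rotation, i.e. cs and sn with cs^2 + sn^2 == 1 such that
       [ cs  sn ] [ f ]   [ r ]
       [-sn  cs ] [ g ] = [ 0 ].
   A minimal numeric check, assuming the rotg defined just below:

     float32_t cs;
     float32_t sn;
     float32_t r;
     rotg(3.0F, 4.0F, cs, sn, r);
     // |f| <= |g| here, so t = 3/4 and tt = 1.25, giving cs == 0.6F,
     // sn == 0.8F and r == 5.0F; the rotated second component
     // -sn * 3.0F + cs * 4.0F is exactly 0.0F, as required. */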
-inline void rotg(float f, float g, float &cs, float &sn, float &r) {
+inline void rotg(float32_t f, float32_t g, float32_t &cs, float32_t &sn,
+                 float32_t &r) {
   if (f == 0) {
     cs = 0.0F;
     sn = 1.0F;
@@ -362,15 +363,15 @@ inline void rotg(float f, float g, float &cs, float &sn, float &r) {
     return;
   }
   if (std::abs(f) > std::abs(g)) {
-    float t = g / f;
-    float tt = sqrt(1 + t * t);
+    float32_t t = g / f;
+    float32_t tt = sqrt(1 + t * t);
     cs = 1 / tt;
     sn = t / tt;
     r = f * tt;
     return;
   }
-  float t = f / g;
-  float tt = sqrt(1 + t * t);
+  float32_t t = f / g;
+  float32_t tt = sqrt(1 + t * t);
   sn = 1 / tt;
   cs = t / tt;
   r = g * tt;
@@ -379,8 +380,9 @@ inline void rotg(float f, float g, float &cs, float &sn, float &r) {
 // This routine updates singular vectors
 // by applying the Givens rotations
 // used to update the bidiagonal matrix
-inline void update_sigvect(int m, float cs, float sn, armral_cmplx_f32_t *v1,
-                           armral_cmplx_f32_t *v2, int incv) {
+inline void update_sigvect(int m, float32_t cs, float32_t sn,
+                           armral_cmplx_f32_t *v1, armral_cmplx_f32_t *v2,
+                           int incv) {
   for (int i = 0; i < m * incv; i += incv) {
     auto t = v1[i];
     v1[i].re = cs * t.re + sn * v2[i].re;
@@ -390,7 +392,7 @@ inline void update_sigvect(int m, float cs, float sn, armral_cmplx_f32_t *v1,
   }
 }
 
-// householder_qr computes the QR factorisation A = QR.
+// householder_qr computes the QR factorization A = QR.
 // On exit, the elements on and above the diagonal
 // of the A contain the upper triangular matrix R.
 // The elements below the diagonal, with the array tau,
@@ -426,8 +428,8 @@ armral_status armral_householder_qr(int m, int n, armral_cmplx_f32_t *a,
   return ARMRAL_SUCCESS;
 }
 
-// Generate explicitly Q from QR factorisation or from
-// the bidiagonalisation A = Q * B * P^H
+// Generate explicitly Q from QR factorization or from
+// the bidiagonalization A = Q * B * P^H
 armral_status armral_assemble_q(int m, int n, const armral_cmplx_f32_t *a,
                                 const armral_cmplx_f32_t *tau,
                                 armral_cmplx_f32_t *q) {
@@ -469,7 +471,7 @@ armral_status armral_assemble_q(int m, int n, const armral_cmplx_f32_t *a,
 }
 
 // Generate the orthogonal matrix P from
-// the bidiagonalisation A = Q * B * P^H,
+// the bidiagonalization A = Q * B * P^H,
 // note that P^H is generated directly
 // instead of P
 void armral_assemble_p(int m, int n, const armral_cmplx_f32_t *a,
@@ -539,8 +541,8 @@ void armral_assemble_p(int m, int n, const armral_cmplx_f32_t *a,
 // the bidiagonal matrix B. Note that this routine
 // returns directly the conjugate transpose of the
 // left orthogonal matrix.
-armral_status armral_bidiagonalisation(int m, int n, armral_cmplx_f32_t *a,
-                                       float *d, float *e,
+armral_status armral_bidiagonalization(int m, int n, armral_cmplx_f32_t *a,
+                                       float32_t *d, float32_t *e,
                                        armral_cmplx_f32_t *tauq,
                                        armral_cmplx_f32_t *taup) {
   if (m < n) {
@@ -622,7 +624,8 @@ armral_status armral_bidiagonalisation(int m, int n, armral_cmplx_f32_t *a,
 
 // "Singular Value Decomposition and Least Squares Solutions"
 // published in Numer. Math. 14, 403--420 (1970).
 armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n,
-                                    float *d, float *e, armral_cmplx_f32_t *u,
+                                    float32_t *d, float32_t *e,
+                                    armral_cmplx_f32_t *u,
                                     armral_cmplx_f32_t *vt, int u_stride) {
   if (m < n) {
@@ -638,14 +641,14 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n,
   // Compute the 1-norm of the bidiagonal matrix
   // for the computation of the stopping criteria.
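/* Editorial aside, not part of the patch: the loop below computes
   anorm = max_i(|d[i]| + |e[i]|), the inexpensive estimate of the norm of
   the bidiagonal matrix used in the classic Golub-Reinsch routine cited
   above; the stopping tolerance is then tol = anorm * eps, i.e. machine
   epsilon scaled by the magnitude of the matrix. */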
-  float anorm = 0;
+  float32_t anorm = 0;
   for (int i = 0; i < n; i++) {
-    float tmp = std::abs(d[i]) + std::abs(e[i]);
+    float32_t tmp = std::abs(d[i]) + std::abs(e[i]);
     if (anorm < tmp) {
       anorm = tmp;
     }
   }
-  float tol = anorm * eps;
+  float32_t tol = anorm * eps;
   int maxiter = n * n;
 
   // Loop over the columns
@@ -675,16 +678,16 @@
     // In this case, an extra sequence of Givens rotations is
     // applied from the left to annihilate the off-diagonal E[next_col].
     if (diag_is_zero) {
-      float cs = 0.0;
-      float sn = 1.0;
+      float32_t cs = 0.0;
+      float32_t sn = 1.0;
       for (int i = next_col; i < curr_col; i++) {
-        float f = sn * e[i];
+        float32_t f = sn * e[i];
         e[i] *= cs;
         if (std::abs(f) <= tol) {
           break;
         }
-        float g = d[i];
-        float h;
+        float32_t g = d[i];
+        float32_t h;
         rotg(f, g, cs, sn, h);
         d[i] = h;
         // Update left singular vectors.
@@ -694,7 +697,7 @@
         }
       }
     }
-    float z = d[curr_col];
+    float32_t z = d[curr_col];
     if (next_col == curr_col) {
       // Make singular value nonnegative and update
       // the corresponding right singular vectors.
@@ -722,20 +725,20 @@
     // the 2 eigenvalues are (d1 + d2)/2 +/- sqrt(((d1 - d2)/2)^2 + e1^2).
     // The choice of this shift accelerates the convergence of the
     // most bottom off-diagonal E[curr_col] to zero.
-    float x = d[next_col];
-    float y = d[curr_col - 1];
-    float g = e[curr_col - 1];
-    float h = e[curr_col];
+    float32_t x = d[next_col];
+    float32_t y = d[curr_col - 1];
+    float32_t g = e[curr_col - 1];
+    float32_t h = e[curr_col];
     // a^2 - b^2 operations are computed as
     // (a - b)* (a + b) to avoid overflow.
-    float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+    float32_t f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
     g = sqrt(f * f + 1);
     f = ((x - z) * (x + z) + h * (y / (f + copysign(g, f)) - h)) / x;
 
     // Shifted QR iteration, bulge chasing, applying
     // successive Givens rotations from right then from left.
-    float c = 1.0F;
-    float s = 1.0F;
+    float32_t c = 1.0F;
+    float32_t s = 1.0F;
     for (int i = next_col + 1; i <= curr_col; i++) {
       g = e[i];
       y = d[i];
@@ -798,7 +801,7 @@
 }
 
 // Apply implicitly Q to an input matrix C of the same dimension
-// as the matrix A that has been factorised into QR or bidiagonalisation.
+// as the matrix A that has been factorized into QR or bidiagonalization.
 struct apply_q_work_buffers {
   armral_cmplx_f32_t *q;
 };
@@ -838,7 +841,7 @@ inline armral_status armral_apply_q(int m, int n, const armral_cmplx_f32_t *a,
 
 // matrix to a triangular form.
 inline int threshold_svd_qr(bool vector_needed, int m, int n) {
-  float crossover_point;
+  float32_t crossover_point;
   if (vector_needed) {
     // In this case, the computational complexities are:
     // 14 * m * n^2 + 8 * n^3 for direct svd,
@@ -859,14 +862,14 @@
 
 // armral_svd computes the SVD decomposition
 // of an m-by-n matrix A in 4 steps.
-// 1- QR factorisation of A.
-// 2- Bidiagonalisation of R.
+// 1- QR factorization of A.
+// 2- Bidiagonalization of R.
 // 3- SVD of the bidiagonal matrix from R.
 // 4- Update of the left singular vectors
 // with the orthogonal matrix from QR.
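/* Editorial aside, not part of the patch: the four steps compose into the
   SVD of A directly:
       A = Q * R            (step 1, QR factorization, Q is m-by-n)
       R = U_r * S * V^H    (steps 2-3, SVD of the n-by-n factor R)
   =>  A = (Q * U_r) * S * V^H,
   so step 4 obtains the final left singular vectors by applying Q to the
   left singular vectors of R. */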
 template <class Allocator>
 armral_status armral_qr_svd(bool gen_singular_vect, int m, int n,
-                            armral_cmplx_f32_t *a, float *s,
+                            armral_cmplx_f32_t *a, float32_t *s,
                             armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt,
                             Allocator &allocator) {
@@ -876,7 +879,7 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n,
   auto r = allocate_zeroed<armral_cmplx_f32_t>(allocator, n * n);
   auto tauq = allocate_uninitialized<armral_cmplx_f32_t>(allocator, n);
   auto taup = allocate_uninitialized<armral_cmplx_f32_t>(allocator, n);
-  auto e = allocate_uninitialized<float>(allocator, n);
+  auto e = allocate_uninitialized<float32_t>(allocator, n);
 
   // u1 and q have the same type as r, so we can reuse that pointer type.
   using cmplx_ptr = decltype(r);
@@ -904,8 +907,8 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n,
       r_mat(i, j) = a_mat(i, j);
     }
   }
-  // Bidiagonalisation of R.
-  armral_bidiagonalisation(n, n, r.get(), s, e.get(), tauq.get(), taup.get());
+  // Bidiagonalization of R.
+  armral_bidiagonalization(n, n, r.get(), s, e.get(), tauq.get(), taup.get());
 
   // Generate left and right orthogonal vectors.
   if (maybe_u1.has_value()) {
@@ -921,7 +924,7 @@
     }
   }
 
-  // Initialise last n*(m-n) elements of u
+  // Initialize last n*(m-n) elements of u
   // to zero in case it is not.
   int remainder = m - n;
   for (int j = 0; j < n; j++) {
@@ -946,18 +949,19 @@
 
 template <class Allocator>
 armral_status armral_svd(bool gen_singular_vect, int m, int n,
-                         armral_cmplx_f32_t *a, float *s, armral_cmplx_f32_t *u,
-                         armral_cmplx_f32_t *vt, Allocator &allocator) {
-  // Bidiagonalisation: A = Q * B * P^H.
+                         armral_cmplx_f32_t *a, float32_t *s,
+                         armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt,
+                         Allocator &allocator) {
+  // Bidiagonalization: A = Q * B * P^H.
   auto tauq = allocate_uninitialized<armral_cmplx_f32_t>(allocator, n);
   auto taup = allocate_uninitialized<armral_cmplx_f32_t>(allocator, n);
-  auto e = allocate_uninitialized<float>(allocator, n);
+  auto e = allocate_uninitialized<float32_t>(allocator, n);
 
   if constexpr (Allocator::is_counting) {
     return ARMRAL_SUCCESS;
   }
 
-  armral_bidiagonalisation(m, n, a, s, e.get(), tauq.get(), taup.get());
+  armral_bidiagonalization(m, n, a, s, e.get(), tauq.get(), taup.get());
 
   // Generate left and right orthogonal vectors if required.
   if (gen_singular_vect) {
@@ -976,11 +980,11 @@
 
 // armral_svd computes the SVD decomposition
 // of an m-by-n matrix. It either performs
 // a direct SVD decomposition of the input matrix,
-// or performs QR factorisation first followed
+// or performs QR factorization first followed
 // by the SVD of R depending on the ratio m/n.
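/* Editorial aside, not part of the patch: a sketch of what the dispatch on
   the m/n ratio can look like; threshold_svd_qr, armral_qr_svd and
   armral_svd are the routines defined above, but the exact condition used
   by the library may differ:

     if (m > threshold_svd_qr(gen_singular_vect, m, n) * n) {
       // Tall-and-skinny input: QR first, then SVD of the n-by-n factor R.
       return armral_qr_svd(gen_singular_vect, m, n, a, s, u, vt, allocator);
     }
     // Otherwise bidiagonalize A directly.
     return armral_svd(gen_singular_vect, m, n, a, s, u, vt, allocator);
*/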
template armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, Allocator &allocator) { @@ -995,14 +999,14 @@ armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, } // anonymous namespace armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt) { heap_allocator allocator{}; return armral_svd_cf32(gen_singular_vect, m, n, a, s, u, vt, allocator); } armral_status armral_svd_cf32_noalloc(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, void *buffer) { buffer_bump_allocator allocator{buffer}; diff --git a/src/SVD/matrix_view.hpp b/src/MatrixFactorizations/SVD/matrix_view.hpp similarity index 83% rename from src/SVD/matrix_view.hpp rename to src/MatrixFactorizations/SVD/matrix_view.hpp index cc2c4d84ec6f1bef8f3d9cbc9bcb1f5e16325e6c..67474185f8caa94fcca7e9e1a128b432e31af2da 100644 --- a/src/SVD/matrix_view.hpp +++ b/src/MatrixFactorizations/SVD/matrix_view.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/CRC/arm_crc11.cpp b/src/UpperPHY/CRC/arm_crc11.cpp index c65f3b11bff6f03f1b9b192d59ad5628a5dee658..d41889d79d8d6a4818e5116a44e5f028d2c26db3 100644 --- a/src/UpperPHY/CRC/arm_crc11.cpp +++ b/src/UpperPHY/CRC/arm_crc11.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc16.cpp b/src/UpperPHY/CRC/arm_crc16.cpp index 42204c3985b97d30eeaf1140d802948c6ef387ca..e727c607acee5bf70ec27d1c2491178bc24e6d63 100644 --- a/src/UpperPHY/CRC/arm_crc16.cpp +++ b/src/UpperPHY/CRC/arm_crc16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_a.cpp b/src/UpperPHY/CRC/arm_crc24_a.cpp index 3eac9c4e735ff02b954feb984534175fef04e80d..af8e43e80047e35204d05ae37d8d87abe62e4ccd 100644 --- a/src/UpperPHY/CRC/arm_crc24_a.cpp +++ b/src/UpperPHY/CRC/arm_crc24_a.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_b.cpp b/src/UpperPHY/CRC/arm_crc24_b.cpp index 6de6116c46ec313c742e67615c15d42ad77ce4bf..b0e9023279d4bb90aa740f73706b0f2e3a11d76d 100644 --- a/src/UpperPHY/CRC/arm_crc24_b.cpp +++ b/src/UpperPHY/CRC/arm_crc24_b.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_c.cpp b/src/UpperPHY/CRC/arm_crc24_c.cpp index 
0e5e4a77525de93a9bdc771f0d96004e52cb10e5..42302a58ddc8e06543081b599fe7b7c5c7e23f77 100644 --- a/src/UpperPHY/CRC/arm_crc24_c.cpp +++ b/src/UpperPHY/CRC/arm_crc24_c.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc6.cpp b/src/UpperPHY/CRC/arm_crc6.cpp index f907683ebb9a181af569d8fba4a27db2cc73fbc1..0277ba31e64634f1c4e54a21763cefa3b9fe5819 100644 --- a/src/UpperPHY/CRC/arm_crc6.cpp +++ b/src/UpperPHY/CRC/arm_crc6.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/crc_basic.hpp b/src/UpperPHY/CRC/crc_basic.hpp index 7c3dfcdad1f3c526917555dcacf18945422d8bb7..0e6e7dfb49f7eadfeeb152071a9c79b2f889bf91 100644 --- a/src/UpperPHY/CRC/crc_basic.hpp +++ b/src/UpperPHY/CRC/crc_basic.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 59460e0998ae1f858ef943a1dd67790f2cdcfe20..47bf69e47039d1a596c204e2e02b50abba1662a1 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,13 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { - // Sometimes compilers don't realise that they don't need an extra + // Sometimes compilers don't realize that they don't need an extra // instruction to extract the 0th lane of a vector, e.g. when doing // vmull_p64(a[0], b[0]), so this just gets around that. poly128_t res; @@ -45,8 +45,8 @@ static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { // There are two reasons why we can't just use the vaddq_p64 intrinsic: // 1. It isn't available on the earliest GCC version we currently support - // 2. If GCC recognises that this is an associative operation, then it tries - // to optimise the operation tree in its tree-reassoc pass, but it + // 2. If GCC recognizes that this is an associative operation, then it tries + // to optimize the operation tree in its tree-reassoc pass, but it // actually makes the performance much worse. Hiding it in assembly means // that the compiler uses our carefully balanced operation tree instead. 
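/* Editorial aside, not part of the patch: addition of polynomials over
   GF(2) is carry-less, so add_p64x2 reduces to a single bitwise EOR of the
   two vector registers; hiding that EOR in inline assembly, as below, is
   what stops GCC's tree-reassoc pass from rebalancing the hand-tuned XOR
   reduction tree used by the CRC kernels. */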
   uint8x16_t res;
diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp
index 43cd7da4155aa01b21a603ae306570cc3f3e2949..c2c62102c160419c7c6af960f6cd4ab0eb9f013f 100644
--- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp
+++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "bit_utils.hpp"
@@ -76,7 +76,7 @@
   }
 
   uint8_t ro_best_i;
-  uint8_t ro_tb_best_i = states; // Initialised with impossible value
+  uint8_t ro_tb_best_i = states; // Initialized with impossible value
   uint8_t iter_cnt = 0;
 
   uint32x4_t preva_init = {0, 2, 4, 6};
diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp
index c936f704f699c29311d28d34ac540d470cfa21fb..58d57d2a96cf74f10e78df62c8895dd947b11e2d 100644
--- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp
+++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #include "armral.h"
diff --git a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp
index 469b2529785543953fff4a609b28e74cac44c915..49ea3fde6b7dfdcf50647f96c23c339575eb9f57 100644
--- a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp
+++ b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 namespace {
diff --git a/src/UpperPHY/Demodulation/arm_demodulation.c b/src/UpperPHY/Demodulation/arm_demodulation.c
index 2a30828264557f5ac46ec3c8500853ec7432ccf2..238abf02462b5540b8460521f48cf11dfc304ab1 100644
--- a/src/UpperPHY/Demodulation/arm_demodulation.c
+++ b/src/UpperPHY/Demodulation/arm_demodulation.c
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "intrinsics.h"
@@ -484,7 +484,7 @@ armral_status armral_demodulation(const uint32_t n_symbols, const uint16_t ulp,
                                   armral_modulation_type mod_type,
                                   const armral_cmplx_int16_t *p_src,
                                   int8_t *p_dst) {
-  // If we don't set the return type, it's because the modType isn't recognised.
+  // If we don't set the return type, it's because the modType isn't recognized.
   // Therefore, we have an argument error by default.
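/* Editorial aside, not part of the patch: with the default set up this way,
   any unsupported mod_type falls through and the call reports
   ARMRAL_ARGUMENT_ERROR. A usage sketch; the constant name ARMRAL_MOD_QPSK
   and the two-LLRs-per-symbol output layout are assumptions for
   illustration, not taken from this patch:

     armral_cmplx_int16_t syms[64];
     int8_t llrs[2 * 64]; // e.g. QPSK carries 2 bits, so 2 LLRs per symbol
     armral_status st =
         armral_demodulation(64, 256, ARMRAL_MOD_QPSK, syms, llrs);
     if (st == ARMRAL_ARGUMENT_ERROR) {
       // mod_type was not a recognized armral_modulation_type value
     }
*/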
armral_status ret = ARMRAL_ARGUMENT_ERROR; switch (mod_type) { diff --git a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp index 0d4fa9b0b819af397aff9a4637571e92ef353e03..33c45766d0c57c216a229f3578d30f0687d016d9 100644 --- a/src/UpperPHY/LDPC/ldpc_coding.hpp +++ b/src/UpperPHY/LDPC/ldpc_coding.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/LDPC/ldpc_decoder.cpp b/src/UpperPHY/LDPC/ldpc_decoder.cpp index 19480003f2fcadfb7c21987e37f6709594ca5b38..ba5297f03a771c911aae3cd4ab50282783dfa22b 100644 --- a/src/UpperPHY/LDPC/ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "bit_utils.hpp" @@ -1382,7 +1382,7 @@ void armral::ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, size_t new_llrs_size = num_llrs; std::optional> maybe_out_llrs; if (!z_is_tiny) { - // Double the storage required to replicate LLRs for optimisation + // Double the storage required to replicate LLRs for optimization new_llrs_size *= 2; // Extra buffer to pack the LLRs again maybe_out_llrs = allocate_uninitialized(allocator, num_llrs); diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index 74a8fe4ba5eb7817982dce0a37adc0f0ff2b2a89..655d7cb3ef06c6e2391fd8feff652d94484b204a 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -1,12 +1,16 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" #include "ldpc_coding.hpp" #include "utils/allocators.hpp" +#ifdef ARMRAL_ARCH_SVE +#include +#endif + #include #include #include @@ -81,7 +85,7 @@ const uint32_t bg1_columns[] = { 1, 6, 10, 67 // row 45: 4 }; -// The shifts are organised by row, and then by index set. Each line in the +// The shifts are organized by row, and then by index set. Each line in the // following represents the shifts in one index set for one block row of the // matrix. Indexing into the array works as follows. 
If we are using index set k // for k in [0, 7], and are on block row i, then the indexing function from k, i @@ -946,6 +950,70 @@ inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t agg0 = svld1_u8(pg, ptr_agg); + svuint8_t agg1 = svld1_u8(pg, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result24 = sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg0); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg0 = svld1_u8(pg_tail, ptr_agg); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg0); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + } + + // Process the final row + { + codeword[(23 * z) - 1] = agg_parity[z - 1]; + codeword[(24 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(25 * z) - 1] = + parity_hdsm[3 * z - 1] ^ parity_hdsm[4 * z - 1] ^ agg_parity[0]; + codeword[(26 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +#else uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; @@ -1025,11 +1093,189 @@ inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, parity_hdsm[3 * z - 1] ^ parity_hdsm[4 * z - 1] ^ agg_parity[0]; codeword[(26 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; } +#endif } inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + if (z == 208) { + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + // zb = 0 to 104 + int32_t full_vectors = 105 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg103 = svld1_u8(pg, ptr_agg + 103); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = 
svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg103); + svuint8_t result24 = + sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg103)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg103); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg103); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = 105 - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg103 = svld1_u8(pg_tail, ptr_agg + 103); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg103); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg103)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg103); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg103); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + // Process zb = 105 to 207 + full_vectors = 103 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg105 = svld1_u8(pg, ptr_agg - 105); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg105); + svuint8_t result24 = + sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg105)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg105); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg105); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + tail_size = 103 - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg105 = svld1_u8(pg_tail, ptr_agg - 105); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg105); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg105)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg105); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg105); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + } else { // z != 208 + + // Process the first row of the loop (zb =0) + { + codeword[22 * z] = agg_parity[z - 1]; + codeword[23 * z] = parity_hdsm[0] ^ 
agg_parity[z - 1]; + codeword[24 * z] = + parity_hdsm[2 * z] ^ parity_hdsm[3 * z] ^ agg_parity[z - 1]; + codeword[25 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + // Process zb = 1 to z + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg1 = svld1_u8(pg, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result24 = sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg1); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg1); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + } +#else if (z == 208) { uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; @@ -1219,12 +1465,77 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, codeword[(25 * z) + zb] = parity_hdsm[3 * z + zb] ^ agg_parity[zb - 1]; } } +#endif } inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + // Process the first row of the loop (zb =0) + { + codeword[10 * z] = agg_parity[z - 1]; + codeword[11 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; + codeword[12 * z] = parity_hdsm[0] ^ parity_hdsm[z] ^ agg_parity[z - 1]; + codeword[13 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg1 = svld1_u8(pg, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result12 = sveor_u8_x(pg, hdsm0, sveor_u8_x(pg, 
hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 10 * z, agg1); + svst1_u8(pg, data_out + 11 * z, result11); + svst1_u8(pg, data_out + 12 * z, result12); + svst1_u8(pg, data_out + 13 * z, result13); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + + // Load inputs + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg_tail, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result12 = + sveor_u8_x(pg_tail, hdsm0, sveor_u8_x(pg_tail, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 10 * z, agg1); + svst1_u8(pg_tail, data_out + 11 * z, result11); + svst1_u8(pg_tail, data_out + 12 * z, result12); + svst1_u8(pg_tail, data_out + 13 * z, result13); + } +#else // Deal with the first row of the loop (zb =0) { codeword[10 * z] = agg_parity[z - 1]; @@ -1301,12 +1612,81 @@ inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, parity_hdsm[zb] ^ parity_hdsm[z + zb] ^ agg_parity[zb - 1]; codeword[(13 * z) + zb] = parity_hdsm[3 * z + zb] ^ agg_parity[zb - 1]; } +#endif } inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg0 = svld1_u8(pg, ptr_agg); + svuint8_t agg1 = svld1_u8(pg, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result12 = sveor_u8_x(pg, hdsm0, sveor_u8_x(pg, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 10 * z, agg0); + svst1_u8(pg, data_out + 11 * z, result11); + svst1_u8(pg, data_out + 12 * z, result12); + svst1_u8(pg, data_out + 13 * z, result13); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + + // Load inputs + svuint8_t agg0 = svld1_u8(pg_tail, ptr_agg); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg_tail, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result12 = + sveor_u8_x(pg_tail, hdsm0, sveor_u8_x(pg_tail, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 10 * z, 
agg0); + svst1_u8(pg_tail, data_out + 11 * z, result11); + svst1_u8(pg_tail, data_out + 12 * z, result12); + svst1_u8(pg_tail, data_out + 13 * z, result13); + } + + // Process the final row outside of the loop + { + codeword[(11 * z) - 1] = agg_parity[z - 1]; + codeword[(12 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(13 * z) - 1] = + parity_hdsm[z - 1] ^ parity_hdsm[2 * z - 1] ^ agg_parity[0]; + codeword[(14 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +#else uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; @@ -1388,6 +1768,7 @@ inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, parity_hdsm[z - 1] ^ parity_hdsm[2 * z - 1] ^ agg_parity[0]; codeword[(14 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; } +#endif } // Set parity for base graph 1 @@ -1457,6 +1838,78 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, uint8_t *codeword) { auto max_ind = graph->nmessage_bits + 4; +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + for (uint32_t i = 4; i < graph->nrows; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then the lifting set index + // is added to this + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < max_ind; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = codeword + z * (graph->nmessage_bits + i); + auto *codeword_ptr = codeword + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, codeword_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, codeword_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + codeword_ptr = codeword + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, codeword_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, codeword_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, 
reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } + // We should have used every column apart from the last one + assert(j == col_entries - 1); + } +#else for (uint32_t i = 4; i < graph->nrows; ++i) { auto row_start_ind = graph->row_start_inds[i]; const auto *col_ptr = graph->col_inds + row_start_ind; @@ -1548,11 +2001,83 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, // We should have used every column apart from the last one assert(j == col_entries - 1); } +#endif } inline void spmv_hdsm(uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, uint8_t *bytes_in, uint8_t *parity_hdsm) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + for (uint32_t i = 0; i < 4; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < graph->nmessage_bits; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = parity_hdsm + z * i; + auto *in_ptr = bytes_in + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + in_ptr = bytes_in + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } + // We should have used every column apart from the last one + assert(j < col_entries && col_ptr[j] >= graph->nmessage_bits); + } +#else for (uint32_t i = 0; i < 4; ++i) { auto row_start_ind = graph->row_start_inds[i]; const auto *col_ptr = graph->col_inds + row_start_ind; @@ -1642,11 +2167,37 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, } assert(j < col_entries && col_ptr[j] >= graph->nmessage_bits); } +#endif } inline 
void copy_input_message(uint32_t z, const armral_ldpc_base_graph_t *graph, const uint8_t *bytes_in, uint8_t *codeword) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + int32_t full_vectors = z / num_lanes; + int32_t tail_size = z - (full_vectors * num_lanes); + + for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { + uint8_t *out_ptr = codeword + j * z; + const uint8_t *in_ptr = bytes_in + j * z; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg = svld1_u8(pg, in_ptr); + svst1_u8(pg, out_ptr, reg); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg = svld1_u8(pg_tail, in_ptr); + svst1_u8(pg_tail, out_ptr, reg); + } + } +#else for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { uint8_t *out_ptr = codeword + j * z; @@ -1679,10 +2230,60 @@ inline void copy_input_message(uint32_t z, codeword[j * z + zb] = bytes_in[j * z + zb]; } } +#endif } inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, uint8_t *tmp_parity) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + int32_t full_vectors = z / num_lanes; + + // First iteration, tmp_parity is vector of 0 + uint8_t *out_ptr = tmp_parity; + const uint8_t *in_ptr = parity_hdsm; + svuint8_t reg1 = svdup_n_u8(0); + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = z - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + + // Iteration 1 to 3 + for (uint32_t j = 1; j < 4; ++j) { + out_ptr = tmp_parity; + in_ptr = parity_hdsm + z * j; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } +#else // First iteration, tmp_parity is vector of 0 uint8_t *out_ptr = tmp_parity; const uint8_t *in_ptr = parity_hdsm; @@ -1752,6 +2353,7 @@ inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, tmp_parity[zb] ^= parity_hdsm[j * z + zb]; } } +#endif } template diff --git a/src/UpperPHY/LDPC/ldpc_rate_common.hpp b/src/UpperPHY/LDPC/ldpc_rate_common.hpp index 3858f49fd118fbe0b6ccc8d7548929312aaff247..e4037ea32d1d611f87e677e3efa37c0360caaa30 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_common.hpp +++ b/src/UpperPHY/LDPC/ldpc_rate_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp index 2324c2f9eff078dc066d058a0f3136def8080d50..9ab67609e65b06cad1d5bac54320c7f98418e789 100644 --- 
a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp index 6fa9b6c3acf449d23967d6919d6e4a9eb4a841c0..206e6548253f3357570cde5d7ccbf739e196a5e3 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_rate_common.hpp" diff --git a/src/UpperPHY/Modulation/arm_modulation.c b/src/UpperPHY/Modulation/arm_modulation.c index 96c91bb83b6165ec629985a5927a4f9910c915ef..242acb4e9824ae0a456d87ea0432a8240326bb61 100644 --- a/src/UpperPHY/Modulation/arm_modulation.c +++ b/src/UpperPHY/Modulation/arm_modulation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp index 3925063a5e5ef28ac08add6df29a2e7be205a8b1..341b4b0ce481f64c42dfeed5c094c222d2a4f7ab 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_crc_check.cpp b/src/UpperPHY/Polar/arm_polar_crc_check.cpp index f88507361e96e9f524a81ea0f0fd536d0e698da1..b0682cd948ac92bd52ebf7cdd99ef990c1631e30 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_check.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_check.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_decoder.cpp b/src/UpperPHY/Polar/arm_polar_decoder.cpp index 4c4cc8d87c137ed0542a2b010229e5972959d62a..b1db62043cf5ac793c5aa18fc823a9e90e15b476 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.cpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -38,7 +38,7 @@ inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, // g(a_h, b_h, c_i=1) = a_h - b_h // This matches the non-list version, but for L > 1 we need to take care of // permuting the input beliefs by the list history value rather than simply - // vectorising the beliefs directly. + // vectorizing the beliefs directly. 
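// [Editor's note] For readers coming to this hunk cold: the g update that
// the comment above describes combines two input beliefs a and b with an
// already-decoded bit c. A minimal scalar sketch of the rule, assuming the
// sign convention quoted in the comment (hypothetical helper name, and
// saturation omitted; the library itself works on saturating int8 vectors):
#include <cstdint>
static inline int8_t g_scalar(int8_t a, int8_t b, uint8_t c) {
  // c == 1 selects the difference branch (a - b); c == 0 the sum (a + b).
  return c != 0 ? static_cast<int8_t>(a - b) : static_cast<int8_t>(a + b);
}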
if constexpr (L > 1) { g_l_impl::g_l(in, dec, hist, out); } else { diff --git a/src/UpperPHY/Polar/arm_polar_decoder.hpp b/src/UpperPHY/Polar/arm_polar_decoder.hpp index ef2091cf7f835d26b0bc90b1c6b86f3219ed5c6d..7989bacdb00d31d5bb72ff79ac1ba569f1bccf62 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp index 249f2e0a961c22a1e8a8b8f3874924fa4d7a00e1..fb20d2ebba37b80e82c9bd0147e3a1d9bbc24b98 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once @@ -130,7 +130,7 @@ template<> struct g_l_impl<2, 4> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=4). + // specialized N=2-byte chunks interleaved (times L=4). uint8x8_t h8 = vld_hist_l4(hist); uint8x8_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4}; g_l_x8(in, dec, h8, xs_idx, out); @@ -141,7 +141,7 @@ template<> struct g_l_impl<2, 8> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=8). + // specialized N=2-byte chunks interleaved (times L=8). uint8x16_t h8 = vld_histq_l8(hist); uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; g_l_x16(in, dec, h8, xs_idx, out); @@ -162,7 +162,7 @@ template<> struct g_l_impl<4, 4> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=4-byte chunks interleaved (times L=4). + // specialized N=4-byte chunks interleaved (times L=4). uint8x16_t h8 = vld_histq_l4(hist); uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; g_l_x16(in, dec, h8, xs_idx, out); diff --git a/src/UpperPHY/Polar/arm_polar_encoder.c b/src/UpperPHY/Polar/arm_polar_encoder.c index cd7412529419448403e25c4de60a9460dd614773..5936f57ea554b58d27281480fcd8a61bec454245 100644 --- a/src/UpperPHY/Polar/arm_polar_encoder.c +++ b/src/UpperPHY/Polar/arm_polar_encoder.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index 6887a7447d8a0a4270750c8f22da94a27dc8b3f2..6c5fe66b3f69cc30be39b6e27a96a08f6b0da0e1 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -20,7 +20,7 @@ constexpr frozen_arrays fa; // reliability sequences for particular polar code sizes, derived from table // 5.3.1.2-1 in 3GPP TS 38.212. These are stored in reverse order, so go from // most reliable bit to least reliable. 
This is so that we can iterate forwards -// through these, which makes vectorisation easier. +// through these, which makes vectorization easier. constexpr uint16_t q32[] = {31, 30, 29, 27, 23, 15, 28, 22, 25, 26, 21, 14, 13, 19, 11, 7, 24, 20, 12, 18, 10, 17, 6, 9, 5, 3, 16, 8, 4, 2, 1, 0}; diff --git a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp index 2d6ca39c819faa1d99e8315105f95b9574ddf9a9..dbf884cbeba6d0151e81eda10aa2d9f3b8d717d3 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp index d1c6ff562f1cd5495e01474916b07a1923c753ce..06c903ef62a2298fa02480528463f730f1da5213 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp index 8d0a317e33707326446e7f4a0b1b097b381bd2ce..6b1b86a39661a2e70df233e97accf48ea712b284 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp index d450c19abbb918204ea40deb6c61bbe6ed8f79a8..192d339f5e75a95bfa7621fcc5167d3e479b8788 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index f0935c5d569dd65513705978f04880681c5c98b2..f2415fd4070a5391cdda1c5da688d5f4e7c29eed 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -8,546 +8,11 @@ #include "turbo_tables.hpp" #include "utils/allocators.hpp" -#include -#include - -namespace { - -// With Turbo codes n (=k) is always divisible by 8 so we -// do not have to worry about tail bits -inline void turbo_llrs_to_bits(uint32_t n, const float32x4_t *llr, - uint8_t *data_out) { - uint32_t full_bytes = n >> 3; - constexpr uint32x4_t ones_0 = {128, 64, 32, 16}; - constexpr uint32x4_t ones_1 = {8, 4, 2, 1}; - - for (uint32_t i = 0; i < full_bytes; ++i) { - // The first bit to write in the byte is the most 
significant - uint32x4_t pred_0 = vcltzq_f32(llr[i * 2]); - uint32x4_t pred_1 = vcltzq_f32(llr[i * 2 + 1]); - uint32x4_t mask_0 = vandq_u32(pred_0, ones_0); - uint32x4_t mask_1 = vandq_u32(pred_1, ones_1); - uint32x4_t mask_2 = vorrq_u32(mask_0, mask_1); - data_out[i] = (uint8_t)vaddvq_u32(mask_2); - } -} - -// Take the input int8_t LLRs and convert them to float32x4_ts -inline void convert_llrs(uint32_t k, const int8_t *llrs, - float32x4_t *llrs_f32) { - constexpr int8x16_t idx_0 = {127, 127, 127, 0, 127, 127, 127, 1, - 127, 127, 127, 2, 127, 127, 127, 3}; - constexpr int8x16_t idx_1 = {127, 127, 127, 4, 127, 127, 127, 5, - 127, 127, 127, 6, 127, 127, 127, 7}; - // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time - for (uint32_t i = 0, j = 0; i < k; i += 8, j += 2) { - int8x8_t data = vld1_s8(&llrs[i]); - int32x4_t ldata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_0)); - int32x4_t hdata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_1)); - llrs_f32[j] = vcvtq_n_f32_s32(ldata, 24); - llrs_f32[j + 1] = vcvtq_n_f32_s32(hdata, 24); - } -} - -// Calculate the PDF of the state transition probability on the assumption that -// we are operating on an AWGN channel: -// PDF = (x1/2 (l_uk + l_c*y1)) + (l_c/2 x2 y2) -// In our implementation we assume the channel reliability, l_c, -// has been prescaled by 1/2 to avoid doing so repeatedly here. -template -inline float32x4_t transition_pdf(float32x4_t l_uk, float32x4_t l_c, - float32x4_t y1, float32x4_t y2) { - if constexpr (use_extrinsic) { - float32x4_t term1 = - vmulq_n_f32(vfmaq_f32(vmulq_n_f32(l_uk, 0.5F), l_c, y1), x1); - float32x4_t term2 = vmulq_f32(vmulq_n_f32(l_c, (float32_t)x2), y2); - return vaddq_f32(term1, term2); - } else { - return vmulq_f32(l_c, vaddq_f32(vmulq_n_f32(y1, (float32_t)x1), - vmulq_n_f32(y2, (float32_t)x2))); - } -} - -// Update the extrinsic information output from the decoding stage -// based on the computed LLRs, the old extrinsic information and the input. -inline void update_extrinsic(uint32_t len, const float32x4_t *llr, - float32x4_t *extrinsic, const float32x4_t *input) { - for (uint32_t i = 0; i < len; i++) { - extrinsic[i] = vsubq_f32(vsubq_f32(llr[i], extrinsic[i]), input[i]); - } -} - -// Calculate the trellis termination values. These are independent of the -// extrinsic information and so can be done once without needing to be updated -// on every iteration. -void trellis_termination(const float32x4_t *sys, const float32x4_t *par, - uint32_t k4, float32x4_t l_c, float32x4_t *beta_out) { - // We handle the gammas for the trellis termination bits separately - // as the state transitions are different. The x_{kl} are never 1 - // here, because we always use inputs of 0 to drive the trellis back - // to state 0 in the encoder, so we only need to consider a smaller - // number of state transitions. We also do not have any extrinsic - // information. Because some of the gamma terms will - // always be -INFINITY (specifically indices [1] and [3]) we can forgo - // adding to them to beta or taking the max with them, compared with - // when we calculate beta in the main calculations. 
- float32x4_t unused_extrinsic = {0}; - float32x4_t pdf_00 = - transition_pdf<1, 1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); - float32x4_t pdf_01 = - transition_pdf<1, -1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); - - // We need b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]} - float32x4_t pdf_uzp1 = vuzp1q_f32(pdf_00, pdf_01); - float32x4_t b01 = vtrn2q_f32(pdf_uzp1, pdf_uzp1); - - // We need g01_02 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]}; - float32x4_t pdf_uzp2 = vuzp2q_f32(pdf_00, pdf_01); - float32x4_t g01_02 = vuzp1q_f32(pdf_uzp2, pdf_uzp2); - - float32x4_t beta_term = vaddq_f32(g01_02, b01); - - // We need g01_02_1 = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0]}; - float32x4_t g01_02_1 = vuzp1q_f32(pdf_uzp1, pdf_uzp1); - - // We need b01_1 = {beta_term[0], beta_term[0], beta_term[1], beta_term[1]}; - float32x4_t b01_1 = vzip1q_f32(beta_term, beta_term); - beta_out[0] = vaddq_f32(g01_02_1, b01_1); - - // We need g23_02_1 = {pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]}; - float32x4_t g23_02_1 = vrev64q_f32(g01_02_1); - - // We need b23_1 = {beta_term[2], beta_term[2], beta_term[3], beta_term[3]}; - float32x4_t b23_1 = vzip2q_f32(beta_term, beta_term); - beta_out[1] = vaddq_f32(g23_02_1, b23_1); -} - -// A single max-log-MAP decoder that works on an array of systematic bits (sys), -// an array of parity bits (par), and an array of extrinsic values from a -// previous decoding stage (extrinsic) -void decode_step(const float32x4_t *sys, const float32x4_t *par, - const float32x4_t *extrinsic, uint32_t k4, float32x4_t *llr, - float32x4_t *alpha, const float32x4_t *beta_tail, - float32x4x4_t *pdf4, float32x4_t l_c) { - uint32_t k_idx; - uint32_t kp1_idx; - - constexpr uint8x16_t rev_idx = {12, 13, 14, 15, 8, 9, 10, 11, - 4, 5, 6, 7, 0, 1, 2, 3}; - - // Start by computing the non-zero conditional state transition probabilities - // from state s' to state s for every k, denoted gamma_k(s',s). In general for - // an AWGN channel (ignoring extrinsic information in l_uk): - // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) - // Here there are only 2 possible state transitions into each state - // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 - // terms. - for (uint32_t i = 0; i < k4; i++) { - // The x_{kl} values are the actual systematic and parity values that - // would result from the encoder having transited from state s' to s. - // They can only ever be either 0 or 1 so we precompute the four possible - // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note - // that these 0s and 1s have to be converted to 1s and -1s to match the - // values in y - // - // The y_{kl} values are the observed the systematic and parity inputs. - // These have potentially been perturbed by noise on the channel - // - // Although each of the 8 states of the encoder has in theory 8 - // predecessor states, the encoder's structure means that not all state - // transitions are possible. Each state actually only has 2 predecessor - // states so we only have to compute 16 non-zero values for each input - // LLR. - float32x4_t pdf_00 = - transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_10 = - transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_01 = - transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_11 = - transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); - - // There is considerable duplication in the values we could store. 
For - // example, for a single state the 16 gamma values are: - // - // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; - // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; - // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; - // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; - // - // We therefore choose to store the 4 unique pdf values (using st4) - // as this allows us to access the pdf values contiguously in the - // calculations needed for the alpha and beta values - vst4q_f32((float32_t *)&pdf4[i], - float32x4x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); - - // Accumulate the state transition probabilities forwards through the - // state transition trellis starting from the known encoder start state 0 - for (uint32_t j = 0; j < 4; j++) { - k_idx = 8 * i + j * 2; - kp1_idx = k_idx + 2; - - // We need g0 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], - // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0]}; - // a02 = {alpha[k_idx][0], alpha[k_idx][2], - // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; - float32x4_t g0 = pdf4[i].val[j]; - float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t left_1 = vaddq_f32(g0, a02); - // We need g2 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][2], - // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; - // a13 = {alpha[k_idx][1], alpha[k_idx][3], - // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; - float32x4_t g2 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), rev_idx)); - float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t right_1 = vaddq_f32(g2, a13); - alpha[kp1_idx] = vmaxq_f32(left_1, right_1); - - // We need g1 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][1], - // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][1]}; - // which is g2 above - float32x4_t left_2 = vaddq_f32(g2, a02); - // We need g3 = {gamma[g_k_idx][3], gamma[g_k_idx + 1][3], - // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][3]}; - // which is g0 above - float32x4_t right_2 = vaddq_f32(g0, a13); - alpha[kp1_idx + 1] = vmaxq_f32(left_2, right_2); - } - } - - // Accumulate the state transition probabilities backwards through the state - // transition trellis starting from the beginning of the precomputed tail - // and calculate the conditional probabilities of each bit being either 0 - // or 1 - constexpr uint8x16_t idx_0312 = {0, 1, 2, 3, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11}; - constexpr uint8x16_t idx_3021 = {12, 13, 14, 15, 0, 1, 2, 3, - 8, 9, 10, 11, 4, 5, 6, 7}; - constexpr uint8x16_t idx_2130 = {8, 9, 10, 11, 4, 5, 6, 7, - 12, 13, 14, 15, 0, 1, 2, 3}; - constexpr uint8x16_t idx_1203 = {4, 5, 6, 7, 8, 9, 10, 11, - 0, 1, 2, 3, 12, 13, 14, 15}; - constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, - 8, 9, 10, 11, 0, 1, 2, 3}; - constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, - 4, 5, 6, 7, 12, 13, 14, 15}; - - float32x4x2_t beta_k; - float32x4x2_t beta_kp1 = {beta_tail[0], beta_tail[1]}; - - for (int32_t i = k4 - 1; i >= 0; i--) { - float32x4_t prob_0; - float32x4_t prob_1; - for (int32_t j = 3; j >= 0; j--) { - k_idx = 8 * i + j * 2; - - // We need g01_02 = {gamma[g_k_idx][0], gamma[g_k_idx][2], - // gamma[g_k_idx + 1][0], gamma[g_k_idx + 1][2]}; - // b01 = {beta[b_kp1_idx][0], beta[b_kp1_idx][0], - // beta[b_kp1_idx][1], beta[b_kp1_idx][1]}; - float32x4_t g01_02 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0312)); - float32x4_t b01 = vzip1q_f32(beta_kp1.val[0], beta_kp1.val[0]); - float32x4_t left_1 = 
vaddq_f32(g01_02, b01); - - // We need g13 = {gamma[g_k_idx][1], gamma[g_k_idx][3], - // gamma[g_k_idx + 1][1], gamma[g_k_idx + 1][3]}; - // bp1_01 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx + 1][0], - // beta[b_kp1_idx + 1][1], beta[b_kp1_idx + 1][1]}; - float32x4_t g13 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3021)); - float32x4_t bp1_01 = vzip1q_f32(beta_kp1.val[1], beta_kp1.val[1]); - float32x4_t right_1 = vaddq_f32(g13, bp1_01); - beta_k.val[0] = vmaxq_f32(left_1, right_1); - - // We need g23_02 = {gamma[g_k_idx + 2][0], gamma[g_k_idx + 2][2], - // gamma[g_k_idx + 3][0], gamma[g_k_idx + 3][2]}; - // We need b23 = {beta[b_kp1_idx][2], beta[b_kp1_idx][2], - // beta[b_kp1_idx][3], beta[b_kp1_idx][3]}; - float32x4_t g23_02 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_2130)); - float32x4_t b23 = vzip2q_f32(beta_kp1.val[0], beta_kp1.val[0]); - float32x4_t left_2 = vaddq_f32(g23_02, b23); - - // We need g23_13 = {gamma[g_k_idx + 2][1], gamma[g_k_idx + 2][3], - // gamma[g_k_idx + 3][1], gamma[g_k_idx + 3][3]}; - // bp1_23 = {beta[b_kp1_idx + 1][2], beta[b_kp1_idx + 1][2], - // beta[b_kp1_idx + 1][3], beta[b_kp1_idx + 1][3]}; - float32x4_t g23_13 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_1203)); - float32x4_t bp1_23 = vzip2q_f32(beta_kp1.val[1], beta_kp1.val[1]); - float32x4_t right_2 = vaddq_f32(g23_13, bp1_23); - beta_k.val[1] = vmaxq_f32(left_2, right_2); - - // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], - // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; - // a13 = {alpha[k_idx][1], alpha[k_idx][3], - // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; - // b02_13 = {beta[b_kp1_idx][0], beta[b_kp1_idx + 1][1], - // beta[b_kp1_idx][2], beta[b_kp1_idx + 1][3]}; - // b13_02 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx][1], - // beta[b_kp1_idx + 1][2], beta[b_kp1_idx][3]}; - float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t b02_13 = - vtrn2q_f32(vrev64q_f32(beta_kp1.val[0]), beta_kp1.val[1]); - float32x4_t b13_02 = - vtrn2q_f32(vrev64q_f32(beta_kp1.val[1]), beta_kp1.val[0]); - - // Find the most probable path in which bit i was a 0 - // We need g01_01 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][1], - // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][1]}; - float32x4_t g01_01 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0220)); - left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); - right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); - prob_0[j] = vmaxvq_f32(vmaxq_f32(left_1, right_1)); - - // Find the most probable path in which bit i was a 1 - // We need g10_10 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][0], - // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][0]}; - float32x4_t g10_10 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3113)); - left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); - right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); - prob_1[j] = vmaxvq_f32(vmaxq_f32(left_2, right_2)); - - // Store the current value of beta to use in the next - // round of calculations - beta_kp1 = beta_k; - } - - // Calculate the LLRs - llr[i] = vsubq_f32(prob_0, prob_1); - } -} - -} // namespace - -// The template parameter allows us to disable checking for convergence (and -// thus terminating the iterations early) so we always run a fixed number of -// iterations in our benchmarking -template -void armral::turbo::decode_block(const int8_t *sys, const 
int8_t *par, - const int8_t *itl, uint32_t k, uint8_t *dst, - float32_t l_c, uint32_t max_iter, - Allocator &allocator) { - // This implements multiple steps of the max-log-MAP algorithm, - // which is an approximation to the MAP (BCJR) algorithm. - // It returns a hard decision rather than raw LLRs - - // We will be working with float32x4_t, so work out how - // many of these will be needed to store k float32_ts. - // k is always a multiple of 8, so no need to worry about remainders. - uint32_t k4 = k >> 2; - - auto sys_f32 = allocate_uninitialized(allocator, k4 + 1); - auto par_f32 = allocate_uninitialized(allocator, k4 + 1); - auto itl_f32 = allocate_uninitialized(allocator, k4 + 1); - - auto perm_idx = allocate_uninitialized(allocator, k); - auto perm_sys = allocate_uninitialized(allocator, k4 + 1); - - struct perm_pair { - uint16_t first; - uint16_t second; - }; - - auto perm_lookup = allocate_uninitialized(allocator, k); - - // Allocate space to hold the extrinsic and permuted extrinsic information - // to be passed between the two decoders. Extrinsic is initially set to 0. - auto extrinsic = allocate_zeroed(allocator, k4); - auto perm_extrinsic = allocate_zeroed(allocator, k4); - - // Allocate space for log likelihood ratios from both stages of decoding - auto l1_uky = allocate_uninitialized(allocator, k4); - auto l2_uky = allocate_uninitialized(allocator, k4); - auto prev_l2_uky = allocate_zeroed(allocator, k4); - - // Allocate space to hold alpha and gamma - // alpha stores the forward-accumulated state probabilities for each decoded - // bit, where the LTE encoder has 8 states and there are k+3 bits to decode - // plus the starting condition - auto alpha = allocate_uninitialized(allocator, 8 * k4 + 2); - // gamma stores the conditional state transition probabilities for each of the - // k+3 bits to decode - auto gamma = allocate_uninitialized(allocator, k4); - - // NOTE: All allocations done. - if constexpr (Allocator::is_counting) { - return; - } - - // Convert our LLRs from int8_ts into float32_ts - convert_llrs(k, sys, sys_f32.get()); - convert_llrs(k, par, par_f32.get()); - convert_llrs(k, itl, itl_f32.get()); - - // Unperturb the trellis termination bits. They are transmitted as: - // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 - // Z'2 - // but need to appended to the inputs as: - // X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 - // Z'2 - // We append to the systematic (X), the parity (Z) and the interleaved parity - // (Z') values here, and to the interleaved systematic values (X') further - // down. - sys_f32[k4][0] = (float32_t)sys[k]; - sys_f32[k4][1] = (float32_t)itl[k]; - sys_f32[k4][2] = (float32_t)par[k + 1]; - - par_f32[k4][0] = (float32_t)par[k]; - par_f32[k4][1] = (float32_t)sys[k + 1]; - par_f32[k4][2] = (float32_t)itl[k + 1]; - - itl_f32[k4][0] = (float32_t)par[k + 2]; - itl_f32[k4][1] = (float32_t)sys[k + 3]; - itl_f32[k4][2] = (float32_t)itl[k + 3]; - - // Prescale l_c to avoid doing it repeatedly in the PDF calculations later. - const float32x4_t channel_reliability = vdupq_n_f32(l_c / 2); - - // Generate the permutation vector for the input value of k - // Find the index into the array of parameter arrays corresponding - // to the current k. Subtract 40 because k=40 is the lowest value. 
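// [Editor's note] generate_perm_idx is defined elsewhere in the library.
// For context, f1 and f2 parameterize the LTE quadratic permutation
// polynomial (QPP) interleaver of 3GPP TS 36.212, which maps index i to
// (f1*i + f2*i^2) mod k. A scalar sketch under that assumption (not the
// library's exact implementation):
#include <cstdint>
static inline uint16_t qpp_perm_idx(uint32_t i, uint32_t f1, uint32_t f2,
                                    uint32_t k) {
  // 64-bit intermediates keep f2 * i * i exact up to the largest LTE
  // block size (k = 6144) before reducing modulo k.
  return static_cast<uint16_t>(
      (static_cast<uint64_t>(f1) * i + static_cast<uint64_t>(f2) * i * i) % k);
}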
- int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; - // and extract the correct values of f1 and f2 to build the - // interleaving polynomial - uint16_t f1 = armral::turbo::perm_params[param_idx][0]; - uint16_t f2 = armral::turbo::perm_params[param_idx][1]; - for (uint32_t i = 0; i < k; i++) { - perm_idx[i] = generate_perm_idx(i, f1, f2, k); - } - - // Create a permuted version of the systematic output for use - // with the second decoder - for (uint32_t i = 0; i < k4; i++) { - perm_sys[i][0] = (float32_t)sys[perm_idx[(i * 4) + 0]]; - perm_sys[i][1] = (float32_t)sys[perm_idx[(i * 4) + 1]]; - perm_sys[i][2] = (float32_t)sys[perm_idx[(i * 4) + 2]]; - perm_sys[i][3] = (float32_t)sys[perm_idx[(i * 4) + 3]]; - } - perm_sys[k4][0] = (float32_t)sys[k + 2]; - perm_sys[k4][1] = (float32_t)itl[k + 2]; - perm_sys[k4][2] = (float32_t)par[k + 3]; - - // Create a look-up of the permutation vector that maps [0,...k-1] indices - // to vector element/vector lane pairs. This avoids having to a modulo - // operator every time we want to apply the permutation to vector elements - for (uint32_t i = 0; i < k; i++) { - uint16_t vec_idx = perm_idx[i] / 4; - uint16_t vec_lane = perm_idx[i] % 4; - perm_lookup[i] = perm_pair{vec_idx, vec_lane}; - } - - // Separate arrays to hold the betas of the trellis termination bits for the - // original and permuted inputs - float32x4_t beta_tail[2]; - float32x4_t perm_beta_tail[2]; - - // Initialise alpha - alpha[0] = vdupq_n_f32(-INFINITY); - alpha[1] = vdupq_n_f32(-INFINITY); - alpha[0][0] = 0; - - // Calculate the trellis termination state transition probabilities, which - // do not require any extrinsic information - trellis_termination(sys_f32.get(), par_f32.get(), k4, channel_reliability, - beta_tail); - trellis_termination(perm_sys.get(), itl_f32.get(), k4, channel_reliability, - perm_beta_tail); - - // Initialise the number of iterations - uint32_t num_iter = 0; - - while (num_iter < max_iter) { - // Run the first decoder step - decode_step(sys_f32.get(), par_f32.get(), extrinsic.get(), k4, l1_uky.get(), - alpha.get(), beta_tail, gamma.get(), channel_reliability); - - // Compute the new extrinsic information to pass into the second decoder - update_extrinsic(k4, l1_uky.get(), extrinsic.get(), sys_f32.get()); - - // Need to unpermute extrinsic to match input to second decoder - for (uint32_t i = 0; i < k4; i++) { - perm_extrinsic[i][0] = - extrinsic[perm_lookup[i * 4].first][perm_lookup[i * 4].second]; - perm_extrinsic[i][1] = extrinsic[perm_lookup[i * 4 + 1].first] - [perm_lookup[i * 4 + 1].second]; - perm_extrinsic[i][2] = extrinsic[perm_lookup[i * 4 + 2].first] - [perm_lookup[i * 4 + 2].second]; - perm_extrinsic[i][3] = extrinsic[perm_lookup[i * 4 + 3].first] - [perm_lookup[i * 4 + 3].second]; - } - - // Run the second decoder step - decode_step(perm_sys.get(), itl_f32.get(), perm_extrinsic.get(), k4, - l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), - channel_reliability); - - // Compute the new extrinsic information to pass back into the first encoder - update_extrinsic(k4, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); - - // But need to unpermute extrinsic first - for (uint32_t i = 0; i < k4; i++) { - extrinsic[perm_lookup[i * 4].first][perm_lookup[i * 4].second] = - perm_extrinsic[i][0]; - extrinsic[perm_lookup[i * 4 + 1].first][perm_lookup[i * 4 + 1].second] = - perm_extrinsic[i][1]; - extrinsic[perm_lookup[i * 4 + 2].first][perm_lookup[i * 4 + 2].second] = - perm_extrinsic[i][2]; - extrinsic[perm_lookup[i * 4 + 
3].first][perm_lookup[i * 4 + 3].second] = - perm_extrinsic[i][3]; - } - - // Compare this iteration's results with those from the previous iteration - float32_t max_abs_diff = 0.0; - float32_t max_abs_val = 0.0; - for (uint32_t i = 0; i < k4; i++) { - float32_t abs_diff = vmaxvq_f32(vabdq_f32(l2_uky[i], prev_l2_uky[i])); - float32_t abs_val = vmaxvq_f32(vabsq_f32(l2_uky[i])); - if (abs_diff > max_abs_diff) { - max_abs_diff = abs_diff; - } - if (abs_val > max_abs_val) { - max_abs_val = abs_val; - } - } - - // If we've converged, finish decoding - if constexpr (check_convergence) { - if (max_abs_diff / max_abs_val < - std::numeric_limits::epsilon()) { - break; - } - } - - // Store the current "final" LLRs to use in convergence checking next - // iteration - for (uint32_t i = 0; i < k4; i++) { - prev_l2_uky[i] = l2_uky[i]; - } - - num_iter++; - } - - // Return unpermuted final output from second encoder - // Rather than allocate another new vector, copy into l1_uky and return that - for (uint32_t i = 0; i < k4; i++) { - l1_uky[perm_lookup[i * 4].first][perm_lookup[i * 4].second] = l2_uky[i][0]; - l1_uky[perm_lookup[i * 4 + 1].first][perm_lookup[i * 4 + 1].second] = - l2_uky[i][1]; - l1_uky[perm_lookup[i * 4 + 2].first][perm_lookup[i * 4 + 2].second] = - l2_uky[i][2]; - l1_uky[perm_lookup[i * 4 + 3].first][perm_lookup[i * 4 + 3].second] = - l2_uky[i][3]; - } - - // Make a hard decision based on the final LLRs - turbo_llrs_to_bits(k, l1_uky.get(), dst); -} +#ifdef ARMRAL_ARCH_SVE +#include "turbo_decoder_fp16.hpp" +#else +#include "turbo_decoder_fp32.hpp" +#endif template void armral::turbo::decode_block( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, @@ -571,7 +36,7 @@ static armral_status turbo_decode_block(const int8_t *sys, const int8_t *par, // itself relative to the channel SNR). For reference see: // N. Wehn, "Turbo-decoding without SNR estimation", IEEE Communications // Letters 4(6), pp. 193-195, July 2000. 
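// [Editor's note] Why a fixed channel reliability is safe here: max-log-MAP
// metrics are built purely from additions and max operations, so scaling
// every input LLR by a positive constant scales every output LLR by the
// same constant and leaves the sign-based hard decisions unchanged. A toy
// illustration of that invariance (made-up metric values):
#include <algorithm>
#include <cassert>
int main() {
  float a = 1.5F, b = -0.5F, g = 0.25F; // toy path and branch metrics
  float llr = std::max(a + g, b) - std::max(a, b + g);
  float s = 4.0F; // arbitrary positive input scaling, e.g. a different l_c
  float llr_s = std::max(s * (a + g), s * b) - std::max(s * a, s * (b + g));
  assert((llr < 0.0F) == (llr_s < 0.0F)); // hard decision is unchanged
  return 0;
}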
- armral::turbo::decode_block(sys, par, itl, k, dst, 2.0, max_iter, + armral::turbo::decode_block(sys, par, itl, k, dst, 2.F, max_iter, allocator); return ARMRAL_SUCCESS; } diff --git a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp index 62acc617b2104e92f997d956df835019d2fea68d..896519eb2e4e6872a6ff5677148e5aa2e892220e 100644 --- a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp index 7048c97e422beda91cad3270a925fdfed46345fd..cd4186901a61e2d2d5820f1e1b547dd7f1a55f1e 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_tables.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp index 416cd384995e5058ff87f4f878d9e6f2af1c6ac7..6f00ac761229582658782a142721fcb71c620be6 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/turbo_code.hpp b/src/UpperPHY/Turbo/turbo_code.hpp index f389feddf64622492d9b5342242fd57e1886729d..2dd3ff24369f3e82fdb8543b3360755400a75156 100644 --- a/src/UpperPHY/Turbo/turbo_code.hpp +++ b/src/UpperPHY/Turbo/turbo_code.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp b/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b9244366989a90883d875449b422b953145e16c3 --- /dev/null +++ b/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp @@ -0,0 +1,520 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include + +namespace { + +struct float16x4x8_t { + float16x4_t val[8]; +}; + +// With Turbo codes n (=k) is always divisible by 8 so we +// do not have to worry about tail bits +inline void turbo_llrs_to_bits(uint32_t n, const float16x8_t *llr, + uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + constexpr uint16x8_t ones = {128, 64, 32, 16, 8, 4, 2, 1}; + + for (uint32_t i = 0; i < full_bytes; ++i) { + // The first bit to write in the byte is the most significant + uint16x8_t pred = vcltzq_f16(llr[i]); + uint16x8_t mask = vandq_u16(pred, ones); + data_out[i] = (uint8_t)vaddvq_u16(mask); + } +} + +// Take the input int8_t LLRs and convert them to float16x8_ts +inline void convert_llrs(uint32_t k, const int8_t *llrs, + float16x8_t *llrs_f16) { + constexpr int8x16_t idx = {127, 0, 127, 1, 127, 2, 127, 3, + 127, 4, 127, 5, 127, 6, 127, 7}; + // With turbo codes k is always a multiple of 8 so we do 8 
LLRs at a time
+  for (uint32_t i = 0, j = 0; i < k; i += 8, j++) {
+    int8x8_t data = vld1_s8(&llrs[i]);
+    int16x8_t data_i16 = vreinterpretq_s16_s8(vtbl1q_s8(data, idx));
+    llrs_f16[j] = vcvtq_n_f16_s16(data_i16, 8);
+  }
+}
+
+// Calculate the PDF of the state transition probability on the assumption that
+// we are operating on an AWGN channel:
+// PDF = (x1/2 (l_uk + l_c*y1)) + (l_c/2 x2 y2)
+// In our implementation we assume the channel reliability, l_c,
+// has been prescaled by 1/2 to avoid doing so repeatedly here.
+template<int x1, int x2>
+inline float16x4_t transition_pdf(float16x8_t l_c, float16x8_t y1,
+                                  float16x8_t y2) {
+  return vget_low_f16(
+      vmulq_f16(l_c, vaddq_f16(vmulq_n_f16(y1, (float16_t)x1),
+                               vmulq_n_f16(y2, (float16_t)x2))));
+}
+
+template<int x1, int x2, bool use_extrinsic>
+inline float16x8_t transition_pdf(float16x8_t l_uk, float16x8_t l_c,
+                                  float16x8_t y1, float16x8_t y2) {
+  if constexpr (use_extrinsic) {
+    float16x8_t term1 =
+        vmulq_n_f16(vfmaq_f16(vmulq_n_f16(l_uk, 0.5F), l_c, y1), x1);
+    float16x8_t term2 = vmulq_f16(vmulq_n_f16(l_c, (float16_t)x2), y2);
+    return vaddq_f16(term1, term2);
+  } else {
+    return vmulq_f16(l_c, vaddq_f16(vmulq_n_f16(y1, (float16_t)x1),
+                                    vmulq_n_f16(y2, (float16_t)x2)));
+  }
+}
+
+// Update the extrinsic information output from the decoding stage
+// based on the computed LLRs, the old extrinsic information and the input.
+inline void update_extrinsic(uint32_t len, const float16x8_t *llr,
+                             float16x8_t *extrinsic, const float16x8_t *input) {
+  for (uint32_t i = 0; i < len; i++) {
+    extrinsic[i] = vsubq_f16(vsubq_f16(llr[i], extrinsic[i]), input[i]);
+  }
+}
+
+// Calculate the trellis termination values. These are independent of the
+// extrinsic information and so can be done once without needing to be updated
+// on every iteration.
+float16x8_t trellis_termination(const float16x8_t *sys, const float16x8_t *par,
+                                uint32_t k8, float16x8_t l_c) {
+  // We handle the gammas for the trellis termination bits separately
+  // as the state transitions are different. The x_{kl} are never 1
+  // here, because we always use inputs of 0 to drive the trellis back
+  // to state 0 in the encoder, so we only need to consider a smaller
+  // number of state transitions. We also do not have any extrinsic
+  // information. Because some of the gamma terms will
+  // always be -INFINITY (specifically indices [1] and [3]) we can forgo
+  // adding them to beta or taking the max with them, compared with
+  // when we calculate beta in the main calculations.
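// [Editor's note] A short reference for the "max-log" approximation used
// throughout this decoder: the exact log-domain BCJR combines competing
// path metrics with the Jacobian logarithm max*(a, b) = max(a, b)
// + log(1 + exp(-|a - b|)); max-log-MAP keeps only the max term. A scalar
// sketch (illustrative names, not part of the library):
#include <algorithm>
#include <cmath>
static inline float max_star(float a, float b) {
  // Exact combination: ln(e^a + e^b), written in a numerically stable form.
  return std::max(a, b) + std::log1p(std::exp(-std::fabs(a - b)));
}
static inline float max_log(float a, float b) {
  // Approximation used here: drop the bounded correction term.
  return std::max(a, b);
}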
+ float16x4_t pdf_00 = transition_pdf<1, 1>(l_c, sys[k8], par[k8]); + float16x4_t pdf_01 = transition_pdf<1, -1>(l_c, sys[k8], par[k8]); + + float16x8_t g0102 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1], + pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]}; + + float16x8_t b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2], + pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]}; + + float16x8_t beta_term = vaddq_f16(g0102, b01); + + float16x8_t g = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0], + pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]}; + + float64x2_t beta_term_f64 = vreinterpretq_f64_f16(beta_term); + beta_term_f64 = vsetq_lane_f64(beta_term_f64[0], beta_term_f64, 1); + float16x8_t b0123 = vzip1q_f16(vreinterpretq_f16_f64(beta_term_f64), + vreinterpretq_f16_f64(beta_term_f64)); + + return vaddq_f16(g, b0123); +} + +// A single max-log-MAP decoder that works on an array of systematic bits (sys), +// an array of parity bits (par), and an array of extrinsic values from a +// previous decoding stage (extrinsic) +void decode_step(const float16x8_t *sys, const float16x8_t *par, + const float16x8_t *extrinsic, uint32_t k8, float16x8_t *llr, + float16x8_t *alpha, float16x8_t beta_tail, float16x4x8_t *pdf4, + float16x8_t l_c) { + uint32_t k_idx; + uint32_t kp1_idx; + + // Start by computing the non-zero conditional state transition probabilities + // from state s' to state s for every k, denoted gamma_k(s',s). In general for + // an AWGN channel (ignoring extrinsic information in l_uk): + // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) + // Here there are only 2 possible state transitions into each state + // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 + // terms. + for (uint32_t i = 0; i < k8; i++) { + // The x_{kl} values are the actual systematic and parity values that + // would result from the encoder having transited from state s' to s. + // They can only ever be either 0 or 1 so we precompute the four possible + // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note + // that these 0s and 1s have to be converted to 1s and -1s to match the + // values in y + // + // The y_{kl} values are the observed systematic and parity inputs. + // These have potentially been perturbed by noise on the channel + // + // Although each of the 8 states of the encoder has in theory 8 + // predecessor states, the encoder's structure means that not all state + // transitions are possible. Each state actually only has 2 predecessor + // states so we only have to compute 16 non-zero values for each input + // LLR. + float16x8_t pdf_00 = + transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_10 = + transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_01 = + transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_11 = + transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + + // There is considerable duplication in the values we could store. 
For + // example, for a single state the 16 gamma values are: + // + // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; + // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; + // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; + // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; + // + // We therefore choose to store the 4 unique pdf values (using st4) + // as this allows us to access the pdf values contiguously in the + // calculations needed for the alpha and beta values + vst4q_f16((float16_t *)&pdf4[i], + float16x8x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); + + // Accumulate the state transition probabilities forwards through the + // state transition trellis starting from the known encoder start state 0 + for (uint32_t j = 0; j < 8; j++) { + k_idx = 8 * i + j; + kp1_idx = k_idx + 1; + + float16x4_t fdp = vrev64_f16(pdf4[i].val[j]); + + // We need g02 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0], + // gamma[g_k_idx][2], gamma[g_k_idx + 1][2], + // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; + float16x8_t g02 = vcombine_f16(pdf4[i].val[j], fdp); + + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2], + // alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + float16x8_t a02 = vuzp1q_f16(alpha[k_idx], alpha[k_idx]); + float16x8_t left = vaddq_f16(g02, a02); + + // This is g02 with the 64-bit elements swapped + float16x8_t g20 = vcombine_f16(fdp, pdf4[i].val[j]); + + // We need a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3], + // alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + float16x8_t a13 = vuzp2q_f16(alpha[k_idx], alpha[k_idx]); + float16x8_t right = vaddq_f16(g20, a13); + + alpha[kp1_idx] = vmaxq_f16(left, right); + + // Normalize alpha + if (j % 4 == 0) { + float16x8_t alpha0 = vdupq_n_f16(alpha[kp1_idx][0]); + alpha[kp1_idx] = vsubq_f16(alpha[kp1_idx], alpha0); + } + } + } + + // Accumulate the state transition probabilities backwards through the state + // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1 + + constexpr uint8x16_t idx_even_odd = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; + + constexpr uint8x16_t idx_05274163 = {0, 1, 10, 11, 4, 5, 14, 15, + 8, 9, 2, 3, 12, 13, 6, 7}; + + constexpr uint8x16_t idx_0220 = {0, 1, 4, 5, 4, 5, 0, 1, + 0, 1, 4, 5, 4, 5, 0, 1}; + + constexpr uint8x16_t idx_3113 = {6, 7, 2, 3, 2, 3, 6, 7, + 6, 7, 2, 3, 2, 3, 6, 7}; + + constexpr uint8x16_t idx_0213 = {0, 1, 6, 7, 2, 3, 4, 5, + 4, 5, 2, 3, 6, 7, 0, 1}; + + constexpr uint8x16_t idx_1302 = {6, 7, 0, 1, 4, 5, 2, 3, + 2, 3, 4, 5, 0, 1, 6, 7}; + + float16x8_t beta_kp1 = beta_tail; + + for (int32_t i = k8 - 1; i >= 0; i--) { + float16x8_t prob_0; + float16x8_t prob_1; + + for (int32_t j = 7; j >= 0; j--) { + k_idx = 8 * i + j; + + // Normalize beta + if (j % 4 == 0) { + float16x8_t beta0 = vdupq_n_f16(beta_kp1[0]); + beta_kp1 = vsubq_f16(beta_kp1, beta0); + } + + uint8x16_t pdf8_u8 = + vreinterpretq_u8_f16(vcombine_f16(pdf4[i].val[j], pdf4[i].val[j])); + + // g0213 = {pdf[0], pdf[3], pdf[1], pdf[2], + // pdf[2], pdf[1], pdf[3], pdf[0]}; + float16x8_t g0213 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_0213)); + + // Reverse 32-bit elements in g0213 + // g1302 = {pdf[3], pdf[0], pdf[2], pdf[1], + // pdf[1], 
pdf[2], pdf[0], pdf[3]};
+      float16x8_t g1302 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_1302));
+
+      // b0123 = {beta_kp1[0], beta_kp1[0], beta_kp1[1], beta_kp1[1],
+      //          beta_kp1[2], beta_kp1[2], beta_kp1[3], beta_kp1[3]};
+      // b4567 = {beta_kp1[4], beta_kp1[4], beta_kp1[5], beta_kp1[5],
+      //          beta_kp1[6], beta_kp1[6], beta_kp1[7], beta_kp1[7]};
+      float16x8_t b0123 = vzip1q_f16(beta_kp1, beta_kp1);
+      float16x8_t b4567 = vzip2q_f16(beta_kp1, beta_kp1);
+
+      float16x8_t left = vaddq_f16(g0213, b0123);
+      float16x8_t right = vaddq_f16(g1302, b4567);
+
+      float16x8_t beta_k = vmaxq_f16(left, right);
+
+      // a0213 = {alpha[k_idx][0], alpha[k_idx][2], alpha[k_idx][4], alpha[k_idx][6],
+      //          alpha[k_idx][1], alpha[k_idx][3], alpha[k_idx][5], alpha[k_idx][7]};
+      float16x8_t a0213 = vreinterpretq_f16_u8(
+          vqtbl1q_u8(vreinterpretq_u8_f16(alpha[k_idx]), idx_even_odd));
+
+      // b0213_1302 = {beta_kp1[0], beta_kp1[5], beta_kp1[2], beta_kp1[7],
+      //               beta_kp1[4], beta_kp1[1], beta_kp1[6], beta_kp1[3]};
+      float16x8_t b0213_1302 = vreinterpretq_f16_u8(
+          vqtbl1q_u8(vreinterpretq_u8_f16(beta_kp1), idx_05274163));
+      float16x8_t b1302_0213 = vextq_f16(b0213_1302, b0213_1302, 4);
+
+      // g0101 = {pdf[0], pdf[2], pdf[2], pdf[0]};
+      float16x8_t g0101 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_0220));
+
+      float16x8_t left_right_0 = vaddq_f16(vaddq_f16(a0213, b0213_1302), g0101);
+      float16x4_t left_0 = vget_low_f16(left_right_0);
+      float16x4_t right_0 = vget_high_f16(left_right_0);
+
+      // g1010 = {pdf[3], pdf[1], pdf[1], pdf[3]};
+      float16x8_t g1010 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_3113));
+
+      float16x8_t left_right_1 = vaddq_f16(vaddq_f16(a0213, b1302_0213), g1010);
+
+      float16x4_t left_1 = vget_low_f16(left_right_1);
+      float16x4_t right_1 = vget_high_f16(left_right_1);
+
+      prob_0[j] = vmaxv_f16(vmax_f16(left_0, right_0));
+      prob_1[j] = vmaxv_f16(vmax_f16(left_1, right_1));
+
+      // Store the current value of beta to use in the next
+      // round of calculations
+      beta_kp1 = beta_k;
+    }
+
+    // Calculate the LLRs
+    llr[i] = vsubq_f16(prob_0, prob_1);
+  }
+}
+
+} // namespace
+
+// The template parameter allows us to disable checking for convergence (and
+// thus terminating the iterations early) so we always run a fixed number of
+// iterations in our benchmarking
+template<bool check_convergence, typename Allocator>
+void armral::turbo::decode_block(const int8_t *sys, const int8_t *par,
+                                 const int8_t *itl, uint32_t k, uint8_t *dst,
+                                 float32_t l_c, uint32_t max_iter,
+                                 Allocator &allocator) {
+  // This implements multiple steps of the max-log-MAP algorithm,
+  // which is an approximation to the MAP (BCJR) algorithm.
+  // It returns a hard decision rather than raw LLRs
+
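As background for the decoder body that follows: max-log-MAP replaces the exact log-domain combine of the BCJR algorithm with a plain maximum. A minimal scalar sketch of the two variants, for illustration only (neither helper is part of this patch):

    #include <algorithm>
    #include <cmath>

    // Exact log-domain combine used by full log-MAP (BCJR).
    float log_sum_exp(float a, float b) {
      return std::max(a, b) + std::log1p(std::exp(-std::fabs(a - b)));
    }

    // Max-log-MAP keeps only the maximum; this is what the lane-wise
    // vmaxq_f16/vmaxq_f32 calls in the alpha and beta recursions compute.
    float max_log(float a, float b) {
      return std::max(a, b);
    }

Dropping the correction term trades a small amount of decoding accuracy for removing all transcendental functions from the inner loops.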
+  // We will be working with float16x8_t, so work out how
+  // many of these will be needed to store k float16_ts.
+  // k is always a multiple of 8, so no need to worry about remainders.
+  uint32_t k8 = k >> 3;
+
+  auto sys_f16 = allocate_uninitialized<float16x8_t>(allocator, k8 + 1);
+  auto par_f16 = allocate_uninitialized<float16x8_t>(allocator, k8 + 1);
+  auto itl_f16 = allocate_uninitialized<float16x8_t>(allocator, k8 + 1);
+
+  auto perm_idx = allocate_uninitialized<uint16_t>(allocator, k);
+  auto perm_sys = allocate_uninitialized<float16x8_t>(allocator, k8 + 1);
+
+  struct perm_pair {
+    uint16_t first;
+    uint16_t second;
+  };
+
+  auto perm_lookup = allocate_uninitialized<perm_pair>(allocator, k);
+
+  // Allocate space to hold the extrinsic and permuted extrinsic information
+  // to be passed between the two decoders. Extrinsic is initially set to 0.
+  auto extrinsic = allocate_zeroed<float16x8_t>(allocator, k8);
+  auto perm_extrinsic = allocate_zeroed<float16x8_t>(allocator, k8);
+
+  // Allocate space for log likelihood ratios from both stages of decoding
+  auto l1_uky = allocate_uninitialized<float16x8_t>(allocator, k8);
+  auto l2_uky = allocate_uninitialized<float16x8_t>(allocator, k8);
+  auto prev_l2_uky = allocate_zeroed<float16x8_t>(allocator, k8);
+
+  // Allocate space to hold alpha and gamma
+  // alpha stores the forward-accumulated state probabilities for each decoded
+  // bit, where the LTE encoder has 8 states and there are k+3 bits to decode
+  // plus the starting condition
+  auto alpha = allocate_uninitialized<float16x8_t>(allocator, 8 * k8 + 1);
+
+  // gamma stores the conditional state transition probabilities for each of the
+  // k+3 bits to decode
+  auto gamma = allocate_uninitialized<float16x4x8_t>(allocator, k8);
+
+  // NOTE: All allocations done.
+  if constexpr (Allocator::is_counting) {
+    return;
+  }
+
+  // Convert our LLRs from int8_ts into float16_ts
+  convert_llrs(k, sys, sys_f16.get());
+  convert_llrs(k, par, par_f16.get());
+  convert_llrs(k, itl, itl_f16.get());
+
+  // Unperturb the trellis termination bits. They are transmitted as:
+  //   X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 Z'2
+  // but need to be appended to the inputs as:
+  //   X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 Z'2
+  // We append to the systematic (X), the parity (Z) and the interleaved parity
+  // (Z') values here, and to the interleaved systematic values (X') further
+  // down.
+  sys_f16[k8][0] = (float16_t)sys[k];
+  sys_f16[k8][1] = (float16_t)itl[k];
+  sys_f16[k8][2] = (float16_t)par[k + 1];
+
+  par_f16[k8][0] = (float16_t)par[k];
+  par_f16[k8][1] = (float16_t)sys[k + 1];
+  par_f16[k8][2] = (float16_t)itl[k + 1];
+
+  itl_f16[k8][0] = (float16_t)par[k + 2];
+  itl_f16[k8][1] = (float16_t)sys[k + 3];
+  itl_f16[k8][2] = (float16_t)itl[k + 3];
+
+  // Prescale l_c to avoid doing it repeatedly in the PDF calculations later.
+  const float16x8_t channel_reliability = vdupq_n_f16((float16_t)l_c / 2);
+
+  // Generate the permutation vector for the input value of k
+  // Find the index into the array of parameter arrays corresponding
+  // to the current k. Subtract 40 because k=40 is the lowest value.
+  int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3];
+  // and extract the correct values of f1 and f2 to build the
+  // interleaving polynomial
+  uint16_t f1 = armral::turbo::perm_params[param_idx][0];
+  uint16_t f2 = armral::turbo::perm_params[param_idx][1];
+  for (uint32_t i = 0; i < k; i++) {
+    perm_idx[i] = generate_perm_idx(i, f1, f2, k);
+  }
+
+  // Create a permuted version of the systematic output for use
+  // with the second decoder
+  for (uint32_t i = 0; i < k8; i++) {
+    for (uint32_t j = 0; j < 8; j++) {
+      perm_sys[i][j] = (float16_t)sys[perm_idx[(i * 8) + j]];
+    }
+  }
+  perm_sys[k8][0] = (float16_t)sys[k + 2];
+  perm_sys[k8][1] = (float16_t)itl[k + 2];
+  perm_sys[k8][2] = (float16_t)par[k + 3];
+
+  // Create a look-up of the permutation vector that maps [0,...k-1] indices
+  // to vector element/vector lane pairs. This avoids using the modulo
+  // operator every time we want to apply the permutation to vector elements
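The permutation here is the LTE quadratic permutation polynomial (QPP) interleaver. A scalar model of the index generation that generate_perm_idx is assumed to perform, and of the vector/lane split cached by the loop below (sketch only, with hypothetical helper names):

    #include <cstdint>

    // QPP interleaver, pi(i) = (f1*i + f2*i^2) mod k, per 3GPP TS 36.212.
    // 64-bit arithmetic so that f2*i*i cannot overflow for the largest k.
    uint16_t qpp_index(uint32_t i, uint32_t f1, uint32_t f2, uint32_t k) {
      return (uint16_t)(((uint64_t)f1 * i + (uint64_t)f2 * i * i) % k);
    }

    // One lookup entry: which float16x8_t vector, and which lane within it,
    // index pi(i) falls into, so the hot loops need no division or modulo.
    void lookup_entry(uint16_t pi, uint16_t *vec_idx, uint16_t *vec_lane) {
      *vec_idx = pi / 8;
      *vec_lane = pi % 8;
    }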
+  for (uint32_t i = 0; i < k; i++) {
+    uint16_t vec_idx = perm_idx[i] / 8;
+    uint16_t vec_lane = perm_idx[i] % 8;
+    perm_lookup[i] = perm_pair{vec_idx, vec_lane};
+  }
+
+  // Initialize alpha
+  alpha[0] = vdupq_n_f16(-INFINITY);
+  alpha[0][0] = 0;
+
+  // Calculate the trellis termination state transition probabilities, which
+  // do not require any extrinsic information
+  float16x8_t beta_tail = trellis_termination(sys_f16.get(), par_f16.get(), k8,
+                                              channel_reliability);
+  float16x8_t perm_beta_tail = trellis_termination(
+      perm_sys.get(), itl_f16.get(), k8, channel_reliability);
+
+  // Initialize the number of iterations
+  uint32_t num_iter = 0;
+
+  while (num_iter < max_iter) {
+    // Run the first decoder step
+    decode_step(sys_f16.get(), par_f16.get(), extrinsic.get(), k8, l1_uky.get(),
+                alpha.get(), beta_tail, gamma.get(), channel_reliability);
+
+    // Compute the new extrinsic information to pass into the second decoder
+    update_extrinsic(k8, l1_uky.get(), extrinsic.get(), sys_f16.get());
+
+    // Need to unpermute extrinsic to match input to second decoder
+    for (uint32_t i = 0; i < k8; i++) {
+      for (uint32_t j = 0; j < 8; j++) {
+        perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 8 + j].first]
+                                        [perm_lookup[i * 8 + j].second];
+      }
+    }
+
+    // Run the second decoder step
+    decode_step(perm_sys.get(), itl_f16.get(), perm_extrinsic.get(), k8,
+                l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(),
+                channel_reliability);
+
+    // Compute the new extrinsic information to pass back into the first decoder
+    update_extrinsic(k8, l2_uky.get(), perm_extrinsic.get(), perm_sys.get());
+
+    // But need to unpermute extrinsic first
+    for (uint32_t i = 0; i < k8; i++) {
+      for (uint32_t j = 0; j < 8; j++) {
+        extrinsic[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] =
+            perm_extrinsic[i][j];
+      }
+    }
+
+    // Compare this iteration's results with those from the previous iteration
+    float16_t max_abs_diff = 0.0;
+    float16_t max_abs_val = 0.0;
+    for (uint32_t i = 0; i < k8; i++) {
+      float16_t abs_diff = vmaxvq_f16(vabdq_f16(l2_uky[i], prev_l2_uky[i]));
+      float16_t abs_val = vmaxvq_f16(vabsq_f16(l2_uky[i]));
+      if (abs_diff > max_abs_diff) {
+        max_abs_diff = abs_diff;
+      }
+      if (abs_val > max_abs_val) {
+        max_abs_val = abs_val;
+      }
+    }
+
+    // If we've converged, finish decoding
+    if constexpr (check_convergence) {
+      if (max_abs_diff / max_abs_val <
+          std::numeric_limits<float16_t>::epsilon()) {
+        break;
+      }
+    }
+
+    // Store the current "final" LLRs to use in convergence checking next
+    // iteration
+    for (uint32_t i = 0; i < k8; i++) {
+      prev_l2_uky[i] = l2_uky[i];
+    }
+
+    num_iter++;
+  }
+
+  // Return unpermuted final output from second decoder
+  // Rather than allocate another new vector, copy into l1_uky and return that
+  for (uint32_t i = 0; i < k8; i++) {
+    for (uint32_t j = 0; j < 8; j++) {
+      l1_uky[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] =
+          l2_uky[i][j];
+    }
+  }
+
+  // Make a hard decision based on the final LLRs
+  turbo_llrs_to_bits(k, l1_uky.get(), dst);
+}
diff --git a/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp b/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1fdb6798d0b6b798c14f12b5fd9b286255b108a5
--- /dev/null
+++ b/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp
@@ -0,0 +1,533 @@
+/*
+  Arm RAN Acceleration Library
+  SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+*/
+#pragma once
+
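The hard-decision step that closes decode_block, and the fp32 turbo_llrs_to_bits defined just below, both map the sign of each LLR to a bit and pack MSB-first. A scalar equivalent, for illustration only (the helper name is hypothetical):

    #include <cstdint>

    // A negative LLR decodes to bit 1; eight bits are packed per output
    // byte, most significant bit first, mirroring turbo_llrs_to_bits.
    void llrs_to_bits_scalar(uint32_t n, const float *llr, uint8_t *out) {
      for (uint32_t i = 0; i < n / 8; ++i) {
        uint8_t byte = 0;
        for (uint32_t b = 0; b < 8; ++b) {
          if (llr[i * 8 + b] < 0.0F) {
            byte |= (uint8_t)(0x80U >> b);
          }
        }
        out[i] = byte;
      }
    }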
+#include + +namespace { + +// With Turbo codes n (=k) is always divisible by 8 so we +// do not have to worry about tail bits +inline void turbo_llrs_to_bits(uint32_t n, const float32x4_t *llr, + uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + constexpr uint32x4_t ones_0 = {128, 64, 32, 16}; + constexpr uint32x4_t ones_1 = {8, 4, 2, 1}; + + for (uint32_t i = 0; i < full_bytes; ++i) { + // The first bit to write in the byte is the most significant + uint32x4_t pred_0 = vcltzq_f32(llr[i * 2]); + uint32x4_t pred_1 = vcltzq_f32(llr[i * 2 + 1]); + uint32x4_t mask_0 = vandq_u32(pred_0, ones_0); + uint32x4_t mask_1 = vandq_u32(pred_1, ones_1); + uint32x4_t mask_2 = vorrq_u32(mask_0, mask_1); + data_out[i] = (uint8_t)vaddvq_u32(mask_2); + } +} + +// Take the input int8_t LLRs and convert them to float32x4_ts +inline void convert_llrs(uint32_t k, const int8_t *llrs, + float32x4_t *llrs_f32) { + constexpr int8x16_t idx_0 = {127, 127, 127, 0, 127, 127, 127, 1, + 127, 127, 127, 2, 127, 127, 127, 3}; + constexpr int8x16_t idx_1 = {127, 127, 127, 4, 127, 127, 127, 5, + 127, 127, 127, 6, 127, 127, 127, 7}; + // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time + for (uint32_t i = 0, j = 0; i < k; i += 8, j += 2) { + int8x8_t data = vld1_s8(&llrs[i]); + int32x4_t ldata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_0)); + int32x4_t hdata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_1)); + llrs_f32[j] = vcvtq_n_f32_s32(ldata, 24); + llrs_f32[j + 1] = vcvtq_n_f32_s32(hdata, 24); + } +} + +// Calculate the PDF of the state transition probability on the assumption that +// we are operating on an AWGN channel: +// PDF = (x1/2 (l_uk + l_c*y1)) + (l_c/2 x2 y2) +// In our implementation we assume the channel reliability, l_c, +// has been prescaled by 1/2 to avoid doing so repeatedly here. +template +inline float32x4_t transition_pdf(float32x4_t l_uk, float32x4_t l_c, + float32x4_t y1, float32x4_t y2) { + if constexpr (use_extrinsic) { + float32x4_t term1 = + vmulq_n_f32(vfmaq_f32(vmulq_n_f32(l_uk, 0.5F), l_c, y1), x1); + float32x4_t term2 = vmulq_f32(vmulq_n_f32(l_c, (float32_t)x2), y2); + return vaddq_f32(term1, term2); + } else { + return vmulq_f32(l_c, vaddq_f32(vmulq_n_f32(y1, (float32_t)x1), + vmulq_n_f32(y2, (float32_t)x2))); + } +} + +// Update the extrinsic information output from the decoding stage +// based on the computed LLRs, the old extrinsic information and the input. +inline void update_extrinsic(uint32_t len, const float32x4_t *llr, + float32x4_t *extrinsic, const float32x4_t *input) { + for (uint32_t i = 0; i < len; i++) { + extrinsic[i] = vsubq_f32(vsubq_f32(llr[i], extrinsic[i]), input[i]); + } +} + +// Calculate the trellis termination values. These are independent of the +// extrinsic information and so can be done once without needing to be updated +// on every iteration. +void trellis_termination(const float32x4_t *sys, const float32x4_t *par, + uint32_t k4, float32x4_t l_c, float32x4_t *beta_out) { + // We handle the gammas for the trellis termination bits separately + // as the state transitions are different. The x_{kl} are never 1 + // here, because we always use inputs of 0 to drive the trellis back + // to state 0 in the encoder, so we only need to consider a smaller + // number of state transitions. We also do not have any extrinsic + // information. 
Because some of the gamma terms will
+  // always be -INFINITY (specifically indices [1] and [3]) we can forgo
+  // adding them to beta or taking the max with them, compared with
+  // when we calculate beta in the main calculations.
+  float32x4_t unused_extrinsic = {0};
+  float32x4_t pdf_00 =
+      transition_pdf<1, 1, false>(unused_extrinsic, l_c, sys[k4], par[k4]);
+  float32x4_t pdf_01 =
+      transition_pdf<1, -1, false>(unused_extrinsic, l_c, sys[k4], par[k4]);
+
+  // We need b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]}
+  float32x4_t pdf_uzp1 = vuzp1q_f32(pdf_00, pdf_01);
+  float32x4_t b01 = vtrn2q_f32(pdf_uzp1, pdf_uzp1);
+
+  // We need g01_02 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]};
+  float32x4_t pdf_uzp2 = vuzp2q_f32(pdf_00, pdf_01);
+  float32x4_t g01_02 = vuzp1q_f32(pdf_uzp2, pdf_uzp2);
+
+  float32x4_t beta_term = vaddq_f32(g01_02, b01);
+
+  // We need g01_02_1 = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0]};
+  float32x4_t g01_02_1 = vuzp1q_f32(pdf_uzp1, pdf_uzp1);
+
+  // We need b01_1 = {beta_term[0], beta_term[0], beta_term[1], beta_term[1]};
+  float32x4_t b01_1 = vzip1q_f32(beta_term, beta_term);
+  beta_out[0] = vaddq_f32(g01_02_1, b01_1);
+
+  // We need g23_02_1 = {pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]};
+  float32x4_t g23_02_1 = vrev64q_f32(g01_02_1);
+
+  // We need b23_1 = {beta_term[2], beta_term[2], beta_term[3], beta_term[3]};
+  float32x4_t b23_1 = vzip2q_f32(beta_term, beta_term);
+  beta_out[1] = vaddq_f32(g23_02_1, b23_1);
+}
+
+// A single max-log-MAP decoder that works on an array of systematic bits (sys),
+// an array of parity bits (par), and an array of extrinsic values from a
+// previous decoding stage (extrinsic)
+void decode_step(const float32x4_t *sys, const float32x4_t *par,
+                 const float32x4_t *extrinsic, uint32_t k4, float32x4_t *llr,
+                 float32x4_t *alpha, const float32x4_t *beta_tail,
+                 float32x4x4_t *pdf4, float32x4_t l_c) {
+  uint32_t k_idx;
+  uint32_t kp1_idx;
+
+  constexpr uint8x16_t rev_idx = {12, 13, 14, 15, 8, 9, 10, 11,
+                                  4,  5,  6,  7,  0, 1, 2,  3};
+
+  // Start by computing the non-zero conditional state transition probabilities
+  // from state s' to state s for every k, denoted gamma_k(s',s). In general for
+  // an AWGN channel (ignoring extrinsic information in l_uk):
+  //   gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl})
+  // Here there are only 2 possible state transitions into each state
+  // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2
+  // terms.
+  for (uint32_t i = 0; i < k4; i++) {
+    // The x_{kl} values are the actual systematic and parity values that
+    // would result from the encoder having transited from state s' to s.
+    // They can only ever be either 0 or 1 so we precompute the four possible
+    // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note
+    // that these 0s and 1s have to be converted to 1s and -1s to match the
+    // values in y
+    //
+    // The y_{kl} values are the observed systematic and parity inputs.
+    // These have potentially been perturbed by noise on the channel
+    //
+    // Although each of the 8 states of the encoder has in theory 8
+    // predecessor states, the encoder's structure means that not all state
+    // transitions are possible. Each state actually only has 2 predecessor
+    // states so we only have to compute 16 non-zero values for each input
+    // LLR.
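A scalar restatement of the branch metric that transition_pdf vectorizes, under the same convention that l_c has already been prescaled by 1/2 (sketch only; branch_metric is a hypothetical name):

    // x1, x2 are the hypothesized encoder outputs mapped to +/-1; y1, y2 are
    // the received systematic and parity samples; l_uk is the extrinsic term.
    float branch_metric(float l_uk, float l_c, float y1, float y2,
                        int x1, int x2, bool use_extrinsic) {
      if (use_extrinsic) {
        return (float)x1 * (0.5F * l_uk + l_c * y1) + l_c * (float)x2 * y2;
      }
      return l_c * ((float)x1 * y1 + (float)x2 * y2);
    }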
+ float32x4_t pdf_00 = + transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_10 = + transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_01 = + transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_11 = + transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + + // There is considerable duplication in the values we could store. For + // example, for a single state the 16 gamma values are: + // + // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; + // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; + // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; + // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; + // + // We therefore choose to store the 4 unique pdf values (using st4) + // as this allows us to access the pdf values contiguously in the + // calculations needed for the alpha and beta values + vst4q_f32((float32_t *)&pdf4[i], + float32x4x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); + + // Accumulate the state transition probabilities forwards through the + // state transition trellis starting from the known encoder start state 0 + for (uint32_t j = 0; j < 4; j++) { + k_idx = 8 * i + j * 2; + kp1_idx = k_idx + 2; + + // We need g0 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0]}; + // a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + float32x4_t g0 = pdf4[i].val[j]; + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t left_1 = vaddq_f32(g0, a02); + // We need g2 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][2], + // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + float32x4_t g2 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), rev_idx)); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t right_1 = vaddq_f32(g2, a13); + alpha[kp1_idx] = vmaxq_f32(left_1, right_1); + + // We need g1 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][1], + // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][1]}; + // which is g2 above + float32x4_t left_2 = vaddq_f32(g2, a02); + // We need g3 = {gamma[g_k_idx][3], gamma[g_k_idx + 1][3], + // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][3]}; + // which is g0 above + float32x4_t right_2 = vaddq_f32(g0, a13); + alpha[kp1_idx + 1] = vmaxq_f32(left_2, right_2); + } + } + + // Accumulate the state transition probabilities backwards through the state + // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1 + constexpr uint8x16_t idx_0312 = {0, 1, 2, 3, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11}; + constexpr uint8x16_t idx_3021 = {12, 13, 14, 15, 0, 1, 2, 3, + 8, 9, 10, 11, 4, 5, 6, 7}; + constexpr uint8x16_t idx_2130 = {8, 9, 10, 11, 4, 5, 6, 7, + 12, 13, 14, 15, 0, 1, 2, 3}; + constexpr uint8x16_t idx_1203 = {4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 12, 13, 14, 15}; + constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, + 8, 9, 10, 11, 0, 1, 2, 3}; + constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, + 4, 5, 6, 7, 12, 13, 14, 15}; + + float32x4x2_t beta_k; + float32x4x2_t beta_kp1 = {beta_tail[0], beta_tail[1]}; + + for (int32_t i = k4 - 1; i >= 0; i--) { + float32x4_t prob_0; + float32x4_t prob_1; + for (int32_t j = 3; j >= 0; j--) { + k_idx = 8 * i + j * 
2; + + // We need g01_02 = {gamma[g_k_idx][0], gamma[g_k_idx][2], + // gamma[g_k_idx + 1][0], gamma[g_k_idx + 1][2]}; + // b01 = {beta[b_kp1_idx][0], beta[b_kp1_idx][0], + // beta[b_kp1_idx][1], beta[b_kp1_idx][1]}; + float32x4_t g01_02 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0312)); + float32x4_t b01 = vzip1q_f32(beta_kp1.val[0], beta_kp1.val[0]); + float32x4_t left_1 = vaddq_f32(g01_02, b01); + + // We need g13 = {gamma[g_k_idx][1], gamma[g_k_idx][3], + // gamma[g_k_idx + 1][1], gamma[g_k_idx + 1][3]}; + // bp1_01 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx + 1][0], + // beta[b_kp1_idx + 1][1], beta[b_kp1_idx + 1][1]}; + float32x4_t g13 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3021)); + float32x4_t bp1_01 = vzip1q_f32(beta_kp1.val[1], beta_kp1.val[1]); + float32x4_t right_1 = vaddq_f32(g13, bp1_01); + beta_k.val[0] = vmaxq_f32(left_1, right_1); + + // We need g23_02 = {gamma[g_k_idx + 2][0], gamma[g_k_idx + 2][2], + // gamma[g_k_idx + 3][0], gamma[g_k_idx + 3][2]}; + // We need b23 = {beta[b_kp1_idx][2], beta[b_kp1_idx][2], + // beta[b_kp1_idx][3], beta[b_kp1_idx][3]}; + float32x4_t g23_02 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_2130)); + float32x4_t b23 = vzip2q_f32(beta_kp1.val[0], beta_kp1.val[0]); + float32x4_t left_2 = vaddq_f32(g23_02, b23); + + // We need g23_13 = {gamma[g_k_idx + 2][1], gamma[g_k_idx + 2][3], + // gamma[g_k_idx + 3][1], gamma[g_k_idx + 3][3]}; + // bp1_23 = {beta[b_kp1_idx + 1][2], beta[b_kp1_idx + 1][2], + // beta[b_kp1_idx + 1][3], beta[b_kp1_idx + 1][3]}; + float32x4_t g23_13 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_1203)); + float32x4_t bp1_23 = vzip2q_f32(beta_kp1.val[1], beta_kp1.val[1]); + float32x4_t right_2 = vaddq_f32(g23_13, bp1_23); + beta_k.val[1] = vmaxq_f32(left_2, right_2); + + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + // b02_13 = {beta[b_kp1_idx][0], beta[b_kp1_idx + 1][1], + // beta[b_kp1_idx][2], beta[b_kp1_idx + 1][3]}; + // b13_02 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx][1], + // beta[b_kp1_idx + 1][2], beta[b_kp1_idx][3]}; + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t b02_13 = + vtrn2q_f32(vrev64q_f32(beta_kp1.val[0]), beta_kp1.val[1]); + float32x4_t b13_02 = + vtrn2q_f32(vrev64q_f32(beta_kp1.val[1]), beta_kp1.val[0]); + + // Find the most probable path in which bit i was a 0 + // We need g01_01 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][1], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][1]}; + float32x4_t g01_01 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0220)); + left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); + right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); + prob_0[j] = vmaxvq_f32(vmaxq_f32(left_1, right_1)); + + // Find the most probable path in which bit i was a 1 + // We need g10_10 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][0]}; + float32x4_t g10_10 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3113)); + left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); + right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); + prob_1[j] = vmaxvq_f32(vmaxq_f32(left_2, right_2)); + + // Store the current value of beta to use in the 
next
+      // round of calculations
+      beta_kp1 = beta_k;
+    }
+
+    // Calculate the LLRs
+    llr[i] = vsubq_f32(prob_0, prob_1);
+  }
+}
+
+} // namespace
+
+// The template parameter allows us to disable checking for convergence (and
+// thus terminating the iterations early) so we always run a fixed number of
+// iterations in our benchmarking
+template<bool check_convergence, typename Allocator>
+void armral::turbo::decode_block(const int8_t *sys, const int8_t *par,
+                                 const int8_t *itl, uint32_t k, uint8_t *dst,
+                                 float32_t l_c, uint32_t max_iter,
+                                 Allocator &allocator) {
+  // This implements multiple steps of the max-log-MAP algorithm,
+  // which is an approximation to the MAP (BCJR) algorithm.
+  // It returns a hard decision rather than raw LLRs
+
+  // We will be working with float32x4_t, so work out how
+  // many of these will be needed to store k float32_ts.
+  // k is always a multiple of 8, so no need to worry about remainders.
+  uint32_t k4 = k >> 2;
+
+  auto sys_f32 = allocate_uninitialized<float32x4_t>(allocator, k4 + 1);
+  auto par_f32 = allocate_uninitialized<float32x4_t>(allocator, k4 + 1);
+  auto itl_f32 = allocate_uninitialized<float32x4_t>(allocator, k4 + 1);
+
+  auto perm_idx = allocate_uninitialized<uint16_t>(allocator, k);
+  auto perm_sys = allocate_uninitialized<float32x4_t>(allocator, k4 + 1);
+
+  struct perm_pair {
+    uint16_t first;
+    uint16_t second;
+  };
+
+  auto perm_lookup = allocate_uninitialized<perm_pair>(allocator, k);
+
+  // Allocate space to hold the extrinsic and permuted extrinsic information
+  // to be passed between the two decoders. Extrinsic is initially set to 0.
+  auto extrinsic = allocate_zeroed<float32x4_t>(allocator, k4);
+  auto perm_extrinsic = allocate_zeroed<float32x4_t>(allocator, k4);
+
+  // Allocate space for log likelihood ratios from both stages of decoding
+  auto l1_uky = allocate_uninitialized<float32x4_t>(allocator, k4);
+  auto l2_uky = allocate_uninitialized<float32x4_t>(allocator, k4);
+  auto prev_l2_uky = allocate_zeroed<float32x4_t>(allocator, k4);
+
+  // Allocate space to hold alpha and gamma
+  // alpha stores the forward-accumulated state probabilities for each decoded
+  // bit, where the LTE encoder has 8 states and there are k+3 bits to decode
+  // plus the starting condition
+  auto alpha = allocate_uninitialized<float32x4_t>(allocator, 8 * k4 + 2);
+  // gamma stores the conditional state transition probabilities for each of the
+  // k+3 bits to decode
+  auto gamma = allocate_uninitialized<float32x4x4_t>(allocator, k4);
+
+  // NOTE: All allocations done.
+  if constexpr (Allocator::is_counting) {
+    return;
+  }
+
+  // Convert our LLRs from int8_ts into float32_ts
+  convert_llrs(k, sys, sys_f32.get());
+  convert_llrs(k, par, par_f32.get());
+  convert_llrs(k, itl, itl_f32.get());
+
+  // Unperturb the trellis termination bits. They are transmitted as:
+  //   X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 Z'2
+  // but need to be appended to the inputs as:
+  //   X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 Z'2
+  // We append to the systematic (X), the parity (Z) and the interleaved parity
+  // (Z') values here, and to the interleaved systematic values (X') further
+  // down.
+  sys_f32[k4][0] = (float32_t)sys[k];
+  sys_f32[k4][1] = (float32_t)itl[k];
+  sys_f32[k4][2] = (float32_t)par[k + 1];
+
+  par_f32[k4][0] = (float32_t)par[k];
+  par_f32[k4][1] = (float32_t)sys[k + 1];
+  par_f32[k4][2] = (float32_t)itl[k + 1];
+
+  itl_f32[k4][0] = (float32_t)par[k + 2];
+  itl_f32[k4][1] = (float32_t)sys[k + 3];
+  itl_f32[k4][2] = (float32_t)itl[k + 3];
+
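The tail handling above can be read as a fixed regrouping of the last four entries of each input stream (d0 = sys, d1 = par, d2 = itl). A scalar restatement, for illustration only (regroup_tail is a hypothetical helper; the X' group is appended to the permuted systematic values further down):

    #include <array>

    void regroup_tail(const float d0[4], const float d1[4], const float d2[4],
                      std::array<float, 3> &x, std::array<float, 3> &z,
                      std::array<float, 3> &xp, std::array<float, 3> &zp) {
      x = {d0[0], d2[0], d1[1]};  // X0 X1 X2
      z = {d1[0], d0[1], d2[1]};  // Z0 Z1 Z2
      xp = {d0[2], d2[2], d1[3]}; // X'0 X'1 X'2
      zp = {d1[2], d0[3], d2[3]}; // Z'0 Z'1 Z'2
    }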
+  // Prescale l_c to avoid doing it repeatedly in the PDF calculations later.
+  const float32x4_t channel_reliability = vdupq_n_f32(l_c / 2);
+
+  // Generate the permutation vector for the input value of k
+  // Find the index into the array of parameter arrays corresponding
+  // to the current k. Subtract 40 because k=40 is the lowest value.
+  int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3];
+  // and extract the correct values of f1 and f2 to build the
+  // interleaving polynomial
+  uint16_t f1 = armral::turbo::perm_params[param_idx][0];
+  uint16_t f2 = armral::turbo::perm_params[param_idx][1];
+  for (uint32_t i = 0; i < k; i++) {
+    perm_idx[i] = generate_perm_idx(i, f1, f2, k);
+  }
+
+  // Create a permuted version of the systematic output for use
+  // with the second decoder
+  for (uint32_t i = 0; i < k4; i++) {
+    for (uint32_t j = 0; j < 4; j++) {
+      perm_sys[i][j] = (float32_t)sys[perm_idx[(i * 4) + j]];
+    }
+  }
+  perm_sys[k4][0] = (float32_t)sys[k + 2];
+  perm_sys[k4][1] = (float32_t)itl[k + 2];
+  perm_sys[k4][2] = (float32_t)par[k + 3];
+
+  // Create a look-up of the permutation vector that maps [0,...k-1] indices
+  // to vector element/vector lane pairs. This avoids using the modulo
+  // operator every time we want to apply the permutation to vector elements
+  for (uint32_t i = 0; i < k; i++) {
+    uint16_t vec_idx = perm_idx[i] / 4;
+    uint16_t vec_lane = perm_idx[i] % 4;
+    perm_lookup[i] = perm_pair{vec_idx, vec_lane};
+  }
+
+  // Separate arrays to hold the betas of the trellis termination bits for the
+  // original and permuted inputs
+  float32x4_t beta_tail[2];
+  float32x4_t perm_beta_tail[2];
+
+  // Initialize alpha
+  alpha[0] = vdupq_n_f32(-INFINITY);
+  alpha[1] = vdupq_n_f32(-INFINITY);
+  alpha[0][0] = 0;
+
+  // Calculate the trellis termination state transition probabilities, which
+  // do not require any extrinsic information
+  trellis_termination(sys_f32.get(), par_f32.get(), k4, channel_reliability,
+                      beta_tail);
+  trellis_termination(perm_sys.get(), itl_f32.get(), k4, channel_reliability,
+                      perm_beta_tail);
+
+  // Initialize the number of iterations
+  uint32_t num_iter = 0;
+
+  while (num_iter < max_iter) {
+    // Run the first decoder step
+    decode_step(sys_f32.get(), par_f32.get(), extrinsic.get(), k4, l1_uky.get(),
+                alpha.get(), beta_tail, gamma.get(), channel_reliability);
+
+    // Compute the new extrinsic information to pass into the second decoder
+    update_extrinsic(k4, l1_uky.get(), extrinsic.get(), sys_f32.get());
+
+    // Need to unpermute extrinsic to match input to second decoder
+    for (uint32_t i = 0; i < k4; i++) {
+      for (uint32_t j = 0; j < 4; j++) {
+        perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 4 + j].first]
+                                        [perm_lookup[i * 4 + j].second];
+      }
+    }
+
+    // Run the second decoder step
+    decode_step(perm_sys.get(), itl_f32.get(), perm_extrinsic.get(), k4,
+                l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(),
+                channel_reliability);
+
+    // Compute the new extrinsic information to pass back into the first decoder
+    update_extrinsic(k4, l2_uky.get(), perm_extrinsic.get(), perm_sys.get());
+
+    // But need to unpermute extrinsic first
+    for (uint32_t i = 0; i < k4; i++) {
+      for (uint32_t j = 0; j < 4; j++) {
+        extrinsic[perm_lookup[i * 4 + j].first][perm_lookup[i * 4 + j].second] =
+            perm_extrinsic[i][j];
+      }
+    }
+
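The stopping test assembled below is a relative-change criterion on the final LLRs. In scalar form (sketch only; converged is a hypothetical helper):

    #include <algorithm>
    #include <cmath>
    #include <limits>

    // Stop iterating once the largest change in any LLR is negligible
    // relative to the largest LLR magnitude.
    bool converged(const float *llr, const float *prev, unsigned n) {
      float max_abs_diff = 0.0F;
      float max_abs_val = 0.0F;
      for (unsigned i = 0; i < n; ++i) {
        max_abs_diff = std::max(max_abs_diff, std::fabs(llr[i] - prev[i]));
        max_abs_val = std::max(max_abs_val, std::fabs(llr[i]));
      }
      return max_abs_diff / max_abs_val <
             std::numeric_limits<float>::epsilon();
    }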
+    // Compare this iteration's results with those from the previous iteration
+    float32_t max_abs_diff = 0.0;
+    float32_t max_abs_val = 0.0;
+    for (uint32_t i = 0; i < k4; i++) {
+      float32_t abs_diff = vmaxvq_f32(vabdq_f32(l2_uky[i], prev_l2_uky[i]));
+      float32_t abs_val = vmaxvq_f32(vabsq_f32(l2_uky[i]));
+      if (abs_diff > max_abs_diff) {
+        max_abs_diff = abs_diff;
+      }
+      if (abs_val > max_abs_val) {
+        max_abs_val = abs_val;
+      }
+    }
+
+    // If we've converged, finish decoding
+    if constexpr (check_convergence) {
+      if (max_abs_diff / max_abs_val <
+          std::numeric_limits<float32_t>::epsilon()) {
+        break;
+      }
+    }
+
+    // Store the current "final" LLRs to use in convergence checking next
+    // iteration
+    for (uint32_t i = 0; i < k4; i++) {
+      prev_l2_uky[i] = l2_uky[i];
+    }
+
+    num_iter++;
+  }
+
+  // Return unpermuted final output from second decoder
+  // Rather than allocate another new vector, copy into l1_uky and return that
+  for (uint32_t i = 0; i < k4; i++) {
+    for (uint32_t j = 0; j < 4; j++) {
+      l1_uky[perm_lookup[i * 4 + j].first][perm_lookup[i * 4 + j].second] =
+          l2_uky[i][j];
+    }
+  }
+
+  // Make a hard decision based on the final LLRs
+  turbo_llrs_to_bits(k, l1_uky.get(), dst);
+}
diff --git a/src/UpperPHY/Turbo/turbo_tables.hpp b/src/UpperPHY/Turbo/turbo_tables.hpp
index 1a59ae80dd3eb26095de8f4586fc0248e692aa0f..f2de1e22460f20daabc061038f5060c9ec89bb0b 100644
--- a/src/UpperPHY/Turbo/turbo_tables.hpp
+++ b/src/UpperPHY/Turbo/turbo_tables.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #pragma once
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 7fd26f06203af680140ea0191cccd25ac937c3a1..5e35954996191493154414cb183a42fe4b1e3015 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #pragma once
diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp
index e664173251e336e809d5dbe25aec2a81d9d8fee5..1876dd20285c9386398c6e17aa777312f62f92ce 100644
--- a/src/utils/allocators.hpp
+++ b/src/utils/allocators.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2023-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates
 */
 #pragma once
diff --git a/src/utils/cmplx_arith_f32.hpp b/src/utils/cmplx_arith_f32.hpp
index 32644da766ed0299456c209b406a979437b3b4ae..87022ed2b3b07b13786b938b1223d1a5cba1d737 100644
--- a/src/utils/cmplx_arith_f32.hpp
+++ b/src/utils/cmplx_arith_f32.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #pragma once
diff --git a/src/utils/vec_mul.hpp b/src/utils/vec_mul.hpp
index 2c4896a52029ed55ccee45b0397212ceec962a54..a352058800def11bacb5af0670a65f7a548ec767 100644
--- a/src/utils/vec_mul.hpp
+++ b/src/utils/vec_mul.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #pragma once
diff --git a/test/MatrixInv/batch/main.cpp b/test/BasicMathFun/MatrixInv/Batch/main.cpp
similarity index 87%
rename from test/MatrixInv/batch/main.cpp
rename to test/BasicMathFun/MatrixInv/Batch/main.cpp
index 74b1fcdc61d2f969e73421bed25ce155321a7113..c8ff2e564119996ccdcc8bfe9fbf1fc7c6276033 100644
--- a/test/MatrixInv/batch/main.cpp
+++ b/test/BasicMathFun/MatrixInv/Batch/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024
Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -45,8 +45,9 @@ static void reference_parallel_matinv_block(uint32_t m, * generated input matrix */ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, - bool is_hpd, float scale_re = 1.0F, - float scale_im = 1.0F) { + bool is_hpd, + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_batch_hermitian_matrix_%uX%u_rand () [BATCH] " "[armral_cmplx_f32_t], %s input matrix (scale={%.2f,%.2f}) *****\n", m, m, is_hpd ? "HPD" : "Hermitian", scale_re, scale_im); @@ -82,9 +83,9 @@ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_HER_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -94,8 +95,8 @@ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, */ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, bool is_hpd, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_batch_pa_hermitian_matrix_%uX%u_rand () [BATCH PA] " "[armral_cmplx_f32_t], %s input matrix (scale={%.2f,%.2f}) *****\n", m, m, is_hpd ? "HPD" : "Hermitian", scale_re, scale_im); @@ -138,9 +139,9 @@ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_HER_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -149,8 +150,8 @@ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, * generated input matrix */ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf( "\n***** test_batch_matrix_%uX%u_rand () [BATCH] " "[armral_cmplx_f32_t], general input matrix (scale={%.2f,%.2f}) *****\n", @@ -185,9 +186,9 @@ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_GEN_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_GEN_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -196,8 +197,8 @@ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, * for randomly generated input matrix */ static bool run_batch_pa_matinv_test(uint32_t batch_size, uint32_t m, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf( "\n***** test_batch_pa_matrix_%uX%u_rand () [BATCH PA] " "[armral_cmplx_f32_t], general input matrix (scale={%.2f,%.2f}) *****\n", @@ -240,9 +241,9 @@ static bool run_batch_pa_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= 
check_results_mat_inv("RAND_PARA_GEN_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_GEN_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } diff --git a/test/MatrixInv/single/main.cpp b/test/BasicMathFun/MatrixInv/Single/main.cpp similarity index 83% rename from test/MatrixInv/single/main.cpp rename to test/BasicMathFun/MatrixInv/Single/main.cpp index 9d7e3f935d8a0a6655b37035aff6f273e10f911c..480b5d70d987efcb47cad3ed881b87e38fc34297 100644 --- a/test/MatrixInv/single/main.cpp +++ b/test/BasicMathFun/MatrixInv/Single/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -17,8 +17,8 @@ * have linearly independent rows/cols and thus be invertible. */ static bool run_hermitian_matinv_test(uint32_t m, bool enable_id_check, - bool is_hpd, float scale_re = 1.0F, - float scale_im = 1.0F) { + bool is_hpd, float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_hermitian_matrix_%uX%u_rand () [SINGLE] " "[armral_cmplx_f32_t], " "%s input matrix (scale={%.2f,%.2f}) *****\n", @@ -44,15 +44,15 @@ static bool run_hermitian_matinv_test(uint32_t m, bool enable_id_check, passed = false; } - passed &= - check_results_mat_inv("RAND_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), 2 * m * m, (float)m, (float)m); + passed &= check_results_mat_inv("RAND_HER_MAT_INV", (float32_t *)res.data(), + (float32_t *)ref.data(), 2 * m * m, + (float32_t)m, (float32_t)m); return passed; } static bool run_general_matinv_test(uint32_t m, bool enable_id_check, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_general_mat_inverse_%uX%u_rand () [SINGLE] " "[armral_cmplx_f32_t], " "input matrix (scale={%.2f,%.2f}) *****\n", @@ -77,9 +77,9 @@ static bool run_general_matinv_test(uint32_t m, bool enable_id_check, passed = false; } - passed &= - check_results_mat_inv("RAND_MAT_INV", (float *)res.data(), - (float *)ref.data(), 2 * m * m, (float)m, (float)m); + passed &= check_results_mat_inv("RAND_MAT_INV", (float32_t *)res.data(), + (float32_t *)ref.data(), 2 * m * m, + (float32_t)m, (float32_t)m); return passed; } diff --git a/test/MatrixMult/batch/ArmSolve/main.cpp b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp similarity index 97% rename from test/MatrixMult/batch/ArmSolve/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp index 4ce72015f84188378fee65ba6947e28382ba621a..97d3776ed39da0881afae7357c7dc839eafad594 100644 --- a/test/MatrixMult/batch/ArmSolve/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "cs16_utils.hpp" @@ -93,8 +93,8 @@ run_reference_solve(uint32_t num_sub_carrier, uint32_t sc_per_g, armral_cmplx_int16_t *p_x, uint32_t p_xstride) { int total_cols = num_sub_carrier; for (int j = 0; j < total_cols; ++j) { - float *g_re = &p_g_real[j / sc_per_g]; - float *g_im = &p_g_imag[j / sc_per_g]; + float32_t *g_re = &p_g_real[j / sc_per_g]; + float32_t *g_im = &p_g_imag[j / sc_per_g]; 
for (int i = 0; i < X; ++i) { std::complex acc = 0; for (int k = 0; k < Y; ++k) { diff --git a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp similarity index 98% rename from test/MatrixMult/batch/MatrixVectorMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp index c8e851027812cd1597c19d4636ddb314c6ac87b8..d47fc50aeef8214e2f07da1b9f724190a68eed66 100644 --- a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "matrix_utils.hpp" diff --git a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp similarity index 97% rename from test/MatrixMult/batch/MatrixVectorMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp index 41f3c73157c9bb88a79c015e5dc5b301c1268a04..9c56ee4bc3d6aa927e94fe906db8718ee8e484cc 100644 --- a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/MatrixMult/single/MatrixMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp index e882b7d522feaed18477e9e3c25aea585015f86e..e0322ac744a05e19c357cd3682f0f6334ab29d0a 100644 --- a/test/MatrixMult/single/MatrixMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp index a77c7f808222c5137cf6be444a199bea06f9e9bd..3c70e0046fd6c052970fe5e830dbfb5e958dd2b5 100644 --- a/test/MatrixMult/single/MatrixMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" @@ -44,8 +44,8 @@ static bool run_specific_2x2_iq_matmul_test() { const auto a_im = unpack_imag_cf32(a); const auto b_re = unpack_real_cf32(b); const auto b_im = unpack_imag_cf32(b); - std::vector c_re(4); - std::vector c_im(4); + std::vector c_re(4); + std::vector c_im(4); armral_cmplx_mat_mult_2x2_f32_iq(a_re.data(), a_im.data(), b_re.data(), b_im.data(), c_re.data(), c_im.data()); const auto c = pack_cf32(c_re, c_im); @@ -80,8 +80,8 @@ static bool run_specific_4x4_iq_matmul_test() { const auto a_im = unpack_imag_cf32(a); const auto b_re = unpack_real_cf32(b); const auto b_im = unpack_imag_cf32(b); - 
std::vector c_re(16); - std::vector c_im(16); + std::vector c_re(16); + std::vector c_im(16); armral_cmplx_mat_mult_4x4_f32_iq(a_re.data(), a_im.data(), b_re.data(), b_im.data(), c_re.data(), c_im.data()); const auto c = pack_cf32(c_re, c_im); diff --git a/test/MatrixMult/single/MatrixMultAAH32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp similarity index 88% rename from test/MatrixMult/single/MatrixMultAAH32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 83c4e01989e0bb870d2e2543e0e85a33ddb04343..854bb264d9c297076c3e9229f8fc6ff412cef2a1 100644 --- a/test/MatrixMult/single/MatrixMultAAH32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" diff --git a/test/MatrixMult/single/MatrixMultAHB32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp similarity index 93% rename from test/MatrixMult/single/MatrixMultAHB32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index c09b5b7814fc3daeb8a4cfb7fe3cc26fae70d466..883f8bb50f5c68e0ab5253cf682f5732c71ebe96 100644 --- a/test/MatrixMult/single/MatrixMultAHB32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include diff --git a/test/MatrixMult/single/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixVectorMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp index f6d987d3ae82e11a3c171926b9d4f6730bf82e28..c8591934f77a182744adfb7b92199d8529c517a7 100644 --- a/test/MatrixMult/single/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp similarity index 89% rename from test/MatrixMult/single/MatrixVectorMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp index 4283a28e5e5a8a9663e58c7663309804c937268a..d18604838c2c754cef7e3da9e6a2d3b4d514eaf6 100644 --- a/test/MatrixMult/single/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" diff --git a/test/MatrixPseudoInv/direct/main.cpp b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp similarity index 82% rename from test/MatrixPseudoInv/direct/main.cpp rename to test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp index 74f99d8c64725f62dfca2de8a5798fb0e2c905b4..e644718aa1ef987b1daaafda8baad8d6da892b47 100644 --- a/test/MatrixPseudoInv/direct/main.cpp +++ 
b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" @@ -27,7 +27,12 @@ reference_left_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, // Compute B = C^(-1) std::vector mat_inv(n * n); - reference_matinv_block(n, mat_aha, mat_inv.data()); + if (n == 1) { + mat_inv[0].re = 1.F / mat_aha[0].re; + mat_inv[0].im = 0.F; + } else { + reference_matinv_block(n, mat_aha, mat_inv.data()); + } // Compute B * A^H reference_matmul_bah_cf32(m, n, p_src, mat_inv.data(), p_dst); @@ -50,7 +55,12 @@ static inline void reference_right_pseudo_inverse_direct( // Compute B = C^(-1) std::vector mat_inv(m * m); - reference_matinv_block(m, mat_aah, mat_inv.data()); + if (m == 1) { + mat_inv[0].re = 1.F / mat_aah[0].re; + mat_inv[0].im = 0.F; + } else { + reference_matinv_block(m, mat_aah, mat_inv.data()); + } // Compute A^H * B reference_matmul_ahb_cf32(m, n, m, p_src, mat_inv.data(), p_dst); @@ -89,13 +99,14 @@ bool run_all_tests(char const *test_name, char const *function_name, bool passed = true; const std::tuple params[] = { - {2, 5, -0.968591}, {2, 84, 0.191647}, {2, 2, 1.457848}, - {2, 67, 0.0}, {3, 18, -1.218053}, {3, 138, 1.597186}, - {3, 3, -1.2435186}, {3, 161, 0.0}, {4, 20, -0.474817}, - {4, 105, 0.944802}, {4, 4, 1.645646}, {4, 94, 0.0}, - {8, 35, -1.991369}, {8, 200, -1.244298}, {8, 8, 1.445767}, - {8, 190, 0.0}, {16, 32, 0.809352}, {16, 80, 1.810591}, - {16, 16, -0.426745}, {16, 117, 0.0}}; + {1, 1, 0.186745}, {1, 21, -0.314205}, {1, 66, 1.495806}, + {1, 121, 0.0}, {2, 5, -0.968591}, {2, 84, 0.191647}, + {2, 2, 1.457848}, {2, 67, 0.0}, {3, 18, -1.218053}, + {3, 138, 1.597186}, {3, 3, -1.2435186}, {3, 161, 0.0}, + {4, 20, -0.474817}, {4, 105, 0.944802}, {4, 4, 1.645646}, + {4, 94, 0.0}, {8, 35, -1.991369}, {8, 200, -1.244298}, + {8, 8, 1.445767}, {8, 190, 0.0}, {16, 32, 0.809352}, + {16, 80, 1.810591}, {16, 16, -0.426745}, {16, 117, 0.0}}; for (const auto &[dim1, dim2, l] : params) { printf("[%s] m=%d, n=%d, l=%f\n", function_name, dim1, dim2, l); passed &= run_pseudo_inverse_direct_cf32_test(function_name, dim1, dim2, l, diff --git a/test/VectorDotProd/vecDot16/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp similarity index 91% rename from test/VectorDotProd/vecDot16/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16/main.cpp index 08126313dfae1a3c56b11242fcf3aa34f50f4818..6003f61f6ed7235d4b2f173aef148caff7f54f87 100644 --- a/test/VectorDotProd/vecDot16/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp similarity index 92% rename from test/VectorDotProd/vecDot16_2/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index 764b191e8956066bae74f6a80997b5f243be0804..8ac109254f275b79e3f4b7ff5569cecf407a7c96 100644 --- a/test/VectorDotProd/vecDot16_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 
Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_2_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp similarity index 93% rename from test/VectorDotProd/vecDot16_2_32bit/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index 8fa6efb56b6d5a2bef30a9e940e804d32e01e1a7..904b13fc84af93aac4c61325a557bc1649fa2fc3 100644 --- a/test/VectorDotProd/vecDot16_2_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp similarity index 92% rename from test/VectorDotProd/vecDot16_32bit/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index f400649380073fc2075d8bd17c5b669643aec7fa..b4784ff373faa3f0b30243f3014b7c37a32793b4 100644 --- a/test/VectorDotProd/vecDot16_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot32/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp similarity index 82% rename from test/VectorDotProd/vecDot32/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot32/main.cpp index beca6f0a43d55e64c410a5d74834311126bba96f..a72d8ea712e69d84b3dabefd79956a6347e5fe6c 100644 --- a/test/VectorDotProd/vecDot32/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -19,7 +19,7 @@ static bool run_vec_dot_test(uint32_t num_samples) { for (uint32_t i = 0; i < num_samples; ++i) { acc += cmplx_mul_widen_cf32(a[i], b[i]); } - armral_cmplx_f32_t ref{(float)acc.real(), (float)acc.imag()}; + armral_cmplx_f32_t ref{(float32_t)acc.real(), (float32_t)acc.imag()}; return check_results_cf32(NAME, c.data(), &ref, 1); } diff --git a/test/VectorDotProd/vecDot32_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp similarity index 87% rename from test/VectorDotProd/vecDot32_2/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp index b40850c6a3e676588b941ba3609f4361c274661a..232c14b93ff5b817547b982ee41bd41cfa4dec91 100644 --- a/test/VectorDotProd/vecDot32_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -28,7 +28,7 @@ static bool run_vec_dot_test(uint32_t num_samples) { for (uint32_t i = 0; i < num_samples; ++i) { acc += cmplx_mul_widen_cf32(a[i], b[i]); } - armral_cmplx_f32_t ref{(float)acc.real(), (float)acc.imag()}; + armral_cmplx_f32_t ref{(float32_t)acc.real(), (float32_t)acc.imag()}; return check_results_cf32(NAME, c.data(), &ref, 1); } diff --git a/test/ElemWiseVectorMult/vecMul16/main.cpp 
b/test/BasicMathFun/VectorMult/VecMul16/main.cpp similarity index 97% rename from test/ElemWiseVectorMult/vecMul16/main.cpp rename to test/BasicMathFun/VectorMult/VecMul16/main.cpp index ea5da35dbb6e2d5451311b176e04b577a4bc6ba4..aadebded13f118ba9d03c91da8739975eb2e0e05 100644 --- a/test/ElemWiseVectorMult/vecMul16/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/ElemWiseVectorMult/vecMul16_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp similarity index 97% rename from test/ElemWiseVectorMult/vecMul16_2/main.cpp rename to test/BasicMathFun/VectorMult/VecMul16_2/main.cpp index ed6cb76af5394a64d2875db08cccf8e2bf8da82b..3fa482bd300451aba75e90339cd1794a3f776852 100644 --- a/test/ElemWiseVectorMult/vecMul16_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/ElemWiseVectorMult/vecMul32/main.cpp b/test/BasicMathFun/VectorMult/VecMul32/main.cpp similarity index 89% rename from test/ElemWiseVectorMult/vecMul32/main.cpp rename to test/BasicMathFun/VectorMult/VecMul32/main.cpp index 9ac33ac5adabe9fba26399604f2e085303100bda..0455a4264e4c3b3e7db42bc0c0f54768eb85fbeb 100644 --- a/test/ElemWiseVectorMult/vecMul32/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/ElemWiseVectorMult/vecMul32_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp similarity index 92% rename from test/ElemWiseVectorMult/vecMul32_2/main.cpp rename to test/BasicMathFun/VectorMult/VecMul32_2/main.cpp index 323367c28898d3d5da6c7dbec3232d736b63e588..bda9a5e70d7b82b04a6090c3546c5e9f53a1b906 100644 --- a/test/ElemWiseVectorMult/vecMul32_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/MuLaw/Compression/main.cpp b/test/DuRuInterface/MuLaw/Compression/main.cpp similarity index 99% rename from test/MuLaw/Compression/main.cpp rename to test/DuRuInterface/MuLaw/Compression/main.cpp index c1b8e6c4a4a840eac9a2976290ea1c1758edae2e..e107e1fbfafec7f5d82ad485d968160441c02792 100644 --- a/test/MuLaw/Compression/main.cpp +++ b/test/DuRuInterface/MuLaw/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/MuLaw/Decompression/main.cpp b/test/DuRuInterface/MuLaw/Decompression/main.cpp similarity index 98% rename from test/MuLaw/Decompression/main.cpp rename to test/DuRuInterface/MuLaw/Decompression/main.cpp index 067af2dabb8fb6461eeb0159d7abc750202587b7..eb6ed52247b76db9011c93060f43b9bc1b144cd3 100644 --- 
+++ b/test/DuRuInterface/MuLaw/Decompression/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "cs16_utils.hpp"
diff --git a/test/XRanBlockFloat/Compression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp
similarity index 99%
rename from test/XRanBlockFloat/Compression/main.cpp
rename to test/DuRuInterface/ORanBlockFloat/Compression/main.cpp
index 43824bd810e59c0376068de9f9d5450b069e8291..ecf5d28123223500e9b71cb7b65c9907dd01264e 100644
--- a/test/XRanBlockFloat/Compression/main.cpp
+++ b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/XRanBlockFloat/Decompression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp
similarity index 98%
rename from test/XRanBlockFloat/Decompression/main.cpp
rename to test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp
index 4506d3bbf9afb7a6103cd4cddcd7b5623b5d02e0..087d3a00436677d9caefbb68b87ca8904fcad333 100644
--- a/test/XRanBlockFloat/Decompression/main.cpp
+++ b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/ORanBlockScaling/Compression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp
similarity index 99%
rename from test/ORanBlockScaling/Compression/main.cpp
rename to test/DuRuInterface/ORanBlockScaling/Compression/main.cpp
index ac0356ce64e1057664ac78ffe53a1ddaeb375027..72d680c6c001f82a87831a78b54aab99d4d0f857 100644
--- a/test/ORanBlockScaling/Compression/main.cpp
+++ b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/ORanBlockScaling/Decompression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp
similarity index 98%
rename from test/ORanBlockScaling/Decompression/main.cpp
rename to test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp
index 21744933a0b484259f908132cd141ba9a2d252b7..45cb5fecbaf54442b27a81511053aedda1c26718 100644
--- a/test/ORanBlockScaling/Decompression/main.cpp
+++ b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/Correlation/main.cpp b/test/LowerPHY/Correlation/main.cpp
similarity index 97%
rename from test/Correlation/main.cpp
rename to test/LowerPHY/Correlation/main.cpp
index 192ba5f6d751e1198975fa3c90b7a24ca8fa373c..fa31a8bdd70ba149732738e1a96129d8a7b6b7ae 100644
--- a/test/Correlation/main.cpp
+++ b/test/LowerPHY/Correlation/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "qint64.hpp"
diff --git a/test/FFT/cs16/main.cpp b/test/LowerPHY/FFT/FFT16/main.cpp
similarity index 86%
rename from test/FFT/cs16/main.cpp
rename to test/LowerPHY/FFT/FFT16/main.cpp
index 6f88d577f0404ddf1c2565c74a077096a7a2aadc..33b4b980d399dfa9bbff0ff38910bd6b31238f49 100644
--- a/test/FFT/cs16/main.cpp
+++ b/test/LowerPHY/FFT/FFT16/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "cf32_utils.hpp"
@@ -18,7 +18,7 @@ float clamp_neg1_to_1(float x) {
   float low = -1.0;
-  float high = (float)((1 << 15) - 1) / (1 << 15);
+  float high = (float32_t)((1 << 15) - 1) / (1 << 15);
   return std::max(low, std::min(high, x));
 }
@@ -33,13 +33,13 @@ static bool check_fft_results(const char *name,
   float tol = FLT_EPSILON * (4 * n - 1);
   // since the final result is rounded to Q0.15 format, this is also a
   // potential source of large error (especially for smaller problem sizes).
-  tol = std::max((float)2 / (1 << 15), tol);
+  tol = std::max((float32_t)2 / (1 << 15), tol);
   for (uint32_t i = 0; i < n; ++i) {
-    auto res = std::complex<float>((float)result[i].re / (1 << 15),
-                                   (float)result[i].im / (1 << 15));
-    auto exp = std::complex<float>(clamp_neg1_to_1(expected[i].re),
-                                   clamp_neg1_to_1(expected[i].im));
+    auto res = std::complex<float32_t>((float32_t)result[i].re / (1 << 15),
+                                       (float32_t)result[i].im / (1 << 15));
+    auto exp = std::complex<float32_t>(clamp_neg1_to_1(expected[i].re),
+                                       clamp_neg1_to_1(expected[i].im));
     auto err = std::abs(res - exp);
     max_error = std::max(max_error, err);
     if (err > tol) {
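The tolerance logic in check_fft_results above deserves a short illustration. A Q0.15 sample represents a value in [-1, 1) with a step of 1 / 32768, so rounding the FFT output to Q0.15 contributes up to about one step of error per component regardless of n; that is where the 2 / (1 << 15) floor comes from. A minimal standalone sketch of the conversion (the float32_t typedef below is an assumption, standing in for the arm_neon.h type the library uses):

    #include <cstdint>
    typedef float float32_t; // assumption: matches the Arm float32_t

    // The representable Q0.15 step is 1.0f / 32768, which is why the test
    // clamps its tolerance from below at 2.0f / (1 << 15).
    static inline float32_t q15_to_f32(int16_t x) {
      return (float32_t)x / (1 << 15);
    }

For n = 16, FLT_EPSILON * (4 * 16 - 1) is roughly 7.5e-6, well under 2 / 32768 (about 6.1e-5), so for small transforms the quantization floor dominates the tolerance.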
diff --git a/test/FFT/cf32/main.cpp b/test/LowerPHY/FFT/FFT32/main.cpp
similarity index 92%
rename from test/FFT/cf32/main.cpp
rename to test/LowerPHY/FFT/FFT32/main.cpp
index ed8483c20430d7b86fa498cd6a8d7caa2a481f2b..bdec57c811f23d432b889b84ec96356e63a73da2 100644
--- a/test/FFT/cf32/main.cpp
+++ b/test/LowerPHY/FFT/FFT32/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "cf32_utils.hpp"
@@ -27,8 +27,8 @@ static bool check_fft_results(const char *name,
   float tol = FLT_EPSILON * (4 * n) * 10;
   for (uint32_t i = 0; i < n; ++i) {
-    std::complex<float> res = {result[i].re, result[i].im};
-    std::complex<float> exp = {expected[i].re, expected[i].im};
+    std::complex<float32_t> res = {result[i].re, result[i].im};
+    std::complex<float32_t> exp = {expected[i].re, expected[i].im};
     float err = std::abs(res - exp);
     max_error = std::max(err, max_error);
     if (err > tol) {
diff --git a/test/FIR/arm_fir_filter_cs16/main.cpp b/test/LowerPHY/FIR/FIR16/main.cpp
similarity index 94%
rename from test/FIR/arm_fir_filter_cs16/main.cpp
rename to test/LowerPHY/FIR/FIR16/main.cpp
index 7103678f65014a8f6c85f9e9d6264312d7dcd572..1cda4e8971cd3cf32d0c77a3eb643cfb0fc17ea6 100644
--- a/test/FIR/arm_fir_filter_cs16/main.cpp
+++ b/test/LowerPHY/FIR/FIR16/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
diff --git a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp
similarity index 95%
rename from test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp
rename to test/LowerPHY/FIR/FIR16Decimate2/main.cpp
index a247b9882683651052b0a2971db1b1ad5ce028b0..ab179e5412e08833defd0a593b9981ced1b05d2b 100644
--- a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp
+++ b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
diff --git a/test/FIR/arm_fir_filter_cf32/main.cpp b/test/LowerPHY/FIR/FIR32/main.cpp
similarity index 94%
rename from test/FIR/arm_fir_filter_cf32/main.cpp
rename to test/LowerPHY/FIR/FIR32/main.cpp
index c8c3643f5fff2b05266aff5afd6244648e95f066..2112b9f35837cf2835b0c1d7dc8a54a6ffc8d75c 100644
--- a/test/FIR/arm_fir_filter_cf32/main.cpp
+++ b/test/LowerPHY/FIR/FIR32/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cf32_utils.hpp"
diff --git a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp
similarity index 94%
rename from test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp
rename to test/LowerPHY/FIR/FIR32Decimate2/main.cpp
index 9c8c8e844470e1a7cb743b1f68cd9dfd295bebee..361e7056227169a5c75bd6552e1ebb520bc83aab 100644
--- a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp
+++ b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cf32_utils.hpp"
diff --git a/test/Scrambling/main.cpp b/test/LowerPHY/Scrambling/main.cpp
similarity index 93%
rename from test/Scrambling/main.cpp
rename to test/LowerPHY/Scrambling/main.cpp
index 777276f53062d7d6a0f063c4cbe04443d5c251aa..36ab300f25c6e40ba69e8799f653627be65ed6c9 100644
--- a/test/Scrambling/main.cpp
+++ b/test/LowerPHY/Scrambling/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "bit_utils.hpp"
diff --git a/test/SeqGenerator/main.cpp b/test/LowerPHY/SeqGenerator/main.cpp
similarity index 95%
rename from test/SeqGenerator/main.cpp
rename to test/LowerPHY/SeqGenerator/main.cpp
index 8bb2f614f4a3e69899a4399d2c3c38dac3a10db3..70d50e62ecf8d3797c71c57be749fead10496ce2 100644
--- a/test/SeqGenerator/main.cpp
+++ b/test/LowerPHY/SeqGenerator/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/SVD/main.cpp b/test/MatrixFactorizations/SVD/main.cpp
similarity index 89%
rename from test/SVD/main.cpp
rename to test/MatrixFactorizations/SVD/main.cpp
index a1427907c19d2b14f0c4f4555e0e7b265b09917c..5fe0f85912b72fb2f186bc8d6eec251781c3f424 100644
--- a/test/SVD/main.cpp
+++ b/test/MatrixFactorizations/SVD/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "svd_sample_data.h"
@@ -9,13 +9,13 @@
 namespace {
 // Routine for converting a vector of armral_cmplx_f32_t
-// to a vector of std::complex<float>.
-std::vector<std::complex<float>>
+// to a vector of std::complex<float32_t>.
+std::vector<std::complex<float32_t>>
 convert_arm_cf32_to_complex(uint16_t nvalues,
                             const std::vector<armral_cmplx_f32_t> &a) {
-  std::vector<std::complex<float>> out(nvalues);
+  std::vector<std::complex<float32_t>> out(nvalues);
   for (unsigned i = 0; i < nvalues; ++i) {
-    out[i] = std::complex<float>(a[i].re, a[i].im);
+    out[i] = std::complex<float32_t>(a[i].re, a[i].im);
   }
   return out;
 }
@@ -36,7 +36,7 @@ bool test_svd_with_sample(SVDFunction svd_function_under_test) {
   int m = test.m;
   int size = m * n;
   std::vector<armral_cmplx_f32_t> a = test.a;
-  std::vector<float> s(n);
+  std::vector<float32_t> s(n);

   // Left and right singular vectors.
   std::vector<armral_cmplx_f32_t> u(size);
@@ -51,7 +51,7 @@ bool test_svd_with_sample(SVDFunction svd_function_under_test) {
   // values computed.
   passed &= check_singular_values(n, test.s, s);

-  // Convert data to complex<float> for testing
+  // Convert data to complex<float32_t> for testing
   auto aref_cmplx = convert_arm_cf32_to_complex(size, test.a);
   auto u_cmplx = convert_arm_cf32_to_complex(size, u);
   auto vt_cmplx = convert_arm_cf32_to_complex(n * n, vt);
@@ -75,8 +75,8 @@ bool test_svd(bool gen_singular_vectors, int m, int n, float cond,
   // Generate test matrix with prescribed
   // singular values and condition number
   std::vector<armral_cmplx_f32_t> a(size);
-  std::vector<float> s(n);
-  std::vector<float> sref(n);
+  std::vector<float32_t> s(n);
+  std::vector<float32_t> sref(n);
   int seed = 0;
   generate_svd_matrix(m, n, a, sref, cond, seed);
@@ -99,7 +99,7 @@ bool test_svd(bool gen_singular_vectors, int m, int n, float cond,
   bool passed = check_singular_values(n, sref, s);

   if (gen_singular_vectors) {
-    // Convert data to complex<float> for testing
+    // Convert data to complex<float32_t> for testing
     auto aref_cmplx = convert_arm_cf32_to_complex(size, aref);
     auto u_cmplx = convert_arm_cf32_to_complex(size, u);
     auto vt_cmplx = convert_arm_cf32_to_complex(n * n, vt);
@@ -129,7 +129,7 @@ bool run_all_tests(char const *name, SVDFunction svd_function) {
   std::vector<int> nb_row = {32, 50, 64, 128};
   std::vector<int> nb_col = {4, 8, 16, 20, 28, 32};
   std::vector<bool> check_full_decomposition = {true, false};
-  std::vector<float> cond_number{4, 32, 100, 100, 1000, 10000};
+  std::vector<float32_t> cond_number{4, 32, 100, 100, 1000, 10000};
   for (auto m : nb_row) {
     for (auto n : nb_col) {
       for (auto cond : cond_number) {
diff --git a/test/SVD/svd_sample_data.h b/test/MatrixFactorizations/SVD/svd_sample_data.h
similarity index 93%
rename from test/SVD/svd_sample_data.h
rename to test/MatrixFactorizations/SVD/svd_sample_data.h
index 0add7527886551250b129dfc5bb486270aac7049..51accb35df363d1433e1caff8fdfef2a5b4b1cfc 100644
--- a/test/SVD/svd_sample_data.h
+++ b/test/MatrixFactorizations/SVD/svd_sample_data.h
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
@@ -16,8 +16,8 @@ struct svd_test_param_t {
   uint32_t m;
   uint32_t n;
   std::vector<armral_cmplx_f32_t> a;
-  std::vector<float> s; // singular values
-  float cond;
+  std::vector<float32_t> s; // singular values
+  float32_t cond;
 };

 std::vector<svd_test_param_t> svd_sample_tests = {
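For orientation, the cond values stored with each sample (and the cond_number list in the test driver above) prescribe the singular-value spectrum that generate_svd_matrix, defined further down in svd_test.hpp, builds: s[i] = 1 - i / (n - 1) * (1 - 1 / cond), spacing the values linearly from 1 down to 1/cond. A quick hand check with hypothetical n = 5 and cond = 4:

    // rcond = 1 - 1 / 4 = 0.75
    // s = {1.0, 0.8125, 0.625, 0.4375, 0.25}
    // ratio of extremes: 1.0 / 0.25 = 4 == cond

so each generated test matrix has exactly the requested 2-norm condition number.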
diff --git a/test/SVD/svd_test.hpp b/test/MatrixFactorizations/SVD/svd_test.hpp
similarity index 85%
rename from test/SVD/svd_test.hpp
rename to test/MatrixFactorizations/SVD/svd_test.hpp
index a3ca48541b71b0f4d93dcf296ac70b46fab059fb..3cbcafbf43932384e3942171b739d1364dca9cfa 100644
--- a/test/SVD/svd_test.hpp
+++ b/test/MatrixFactorizations/SVD/svd_test.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
@@ -12,8 +12,7 @@
 #include
 #include

-#define SVD_TEST
-#include "SVD/matrix_view.hpp"
+#include "MatrixFactorizations/SVD/matrix_view.hpp"
 #include "cf32_utils.hpp"

 // In the accuracy tests, a computed solution
@@ -27,26 +26,26 @@
 // in division by a small floating point number.
 #define SAFEMIN 1.17549E-38

-typedef std::complex<float> cf32_t;
+typedef std::complex<float32_t> cf32_t;

 // Generate m-by-n, single complex random matrix
 static inline std::vector<cf32_t> generate_rand(const int m, const int n) {
   int size = m * n;
   std::vector<armral_cmplx_f32_t> a = allocate_random_cf32(size);
-  // Convert matrix to std::complex<float> type
+  // Convert matrix to std::complex<float32_t> type
   std::vector<cf32_t> out(size);
   for (int i = 0; i < size; ++i) {
-    out[i] = std::complex<float>(a[i].re, a[i].im);
+    out[i] = std::complex<float32_t>(a[i].re, a[i].im);
   }
   return out;
 }

-static inline float infinity_norm(int m, int n, const cf32_t *a) {
+static inline float32_t infinity_norm(int m, int n, const cf32_t *a) {
   column_major_matrix_view a_mat{a, m};
-  float inorm = 0;
+  float32_t inorm = 0;
   for (int i = 0; i < m; i++) {
-    float tmp = 0;
+    float32_t tmp = 0;
     for (int j = 0; j < n; j++) {
       tmp += std::abs(a_mat(i, j));
     }
@@ -59,7 +58,8 @@ static inline float infinity_norm(int m, int n, const cf32_t *a) {

 // Overload infinity_norm with an interface
 // with std::vector data type as input
-static inline float infinity_norm(int m, int n, const std::vector<cf32_t> &a) {
+static inline float32_t infinity_norm(int m, int n,
+                                      const std::vector<cf32_t> &a) {
   return infinity_norm(m, n, a.data());
 }
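A quick worked example of infinity_norm (an editorial sketch, not part of the patch): the routine computes the maximum row sum of entry moduli, ||A||_inf = max_i sum_j |a_ij|, where |.| is the complex modulus. For a hypothetical 2-by-2 matrix:

    // A = [ 1     -2 ]   row 0: |1| + |-2|   = 3
    //     [ 3+4i   0 ]   row 1: |3+4i| + |0| = 5
    //
    // so infinity_norm(2, 2, a) == 5

Note the storage is column-major via column_major_matrix_view{a, m}, so a_mat(i, j) presumably reads a[j * m + i]; the sums above run over columns j for each fixed row i.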
@@ -72,7 +72,7 @@ static inline cf32_t clarfg(const int n, cf32_t &aii, cf32_t *x,
   cf32_t alpha = aii;

   // Sum of x[i] * conj(x[i])
-  float sum = 0.0F;
+  float32_t sum = 0.0F;
   for (int i = 0; i < n * incx; i += incx) {
     sum += real(x[i] * conj(x[i]));
   }
@@ -84,12 +84,12 @@
   // Add alpha * conj(alpha) to sum
   // to compute the 2 norm of the full vector
   sum += real(alpha * conj(alpha));
-  float beta = -copysign(sqrt(sum), real(alpha));
-  float safmin = SAFEMIN / std::numeric_limits<float>::epsilon();
-  float rsafmin = 1.0F / safmin;
+  float32_t beta = -copysign(sqrt(sum), real(alpha));
+  float32_t safmin = SAFEMIN / std::numeric_limits<float32_t>::epsilon();
+  float32_t rsafmin = 1.0F / safmin;
   int cnt = 0;
   int max_attempt = 10;
-  float scale = 1.0F;
+  float32_t scale = 1.0F;
   // Check if beta is small enough to induce
   // overflow when taking the inverse, and
   // if it is the case, scale to avoid overflow
@@ -111,16 +111,16 @@
   // Compute tau and update aii
   cf32_t tau = (beta - alpha) / beta;
-  cf32_t normalisation_factor = 1.0F / (alpha - beta);
+  cf32_t normalization_factor = 1.0F / (alpha - beta);
   for (int i = 0; i < n * incx; i += incx) {
-    x[i] = normalisation_factor * x[i];
+    x[i] = normalization_factor * x[i];
   }
   beta /= scale;
   aii = beta;
   return tau;
 }

-// householder_qr computes the QR factorisation A = QR.
+// householder_qr computes the QR factorization A = QR.
 // On exit, the elements on and above the diagonal
 // of the A contain the upper triangular matrix R.
 // The elements below the diagonal, with the array tau,
@@ -160,7 +160,7 @@ static inline void householder_qr(const int m, const int n, cf32_t *a,
 }

 // Apply implicitly Q to an input matrix C of the same dimension
-// as the matrix A that has been factorized into QR or bidiagonalisation.
+// as the matrix A that has been factorized into QR or bidiagonalization.
 static inline void apply_q(int m, int n, const cf32_t *a, const cf32_t *tau,
                            cf32_t *c) {
   if (m < n) {
@@ -199,8 +199,8 @@ static inline void apply_q(int m, int n, const std::vector<cf32_t> &a,
   apply_q(m, n, a.data(), tau.data(), c.data());
 }

-// Generate explicitly Q from QR factorisation or from
-// the bidiagonalisation A = Q * B * P^H
+// Generate explicitly Q from QR factorization or from
+// the bidiagonalization A = Q * B * P^H
 static inline std::vector<cf32_t> get_q(const int m, const int n,
                                         const std::vector<cf32_t> &a,
                                         const std::vector<cf32_t> &tau) {
@@ -246,7 +246,7 @@ static inline std::vector<cf32_t> get_q(const int m, const int n,
 }

 // Generate the orthogonal matrix P from
-// the bidiagonalisation A = Q * B * P^H,
+// the bidiagonalization A = Q * B * P^H,
 // note that P^H is generated directly
 // instead of P
 static inline void get_p(int m, int n, const cf32_t *a, const cf32_t *tau,
@@ -322,18 +322,18 @@ static inline void get_p(int m, int n, const std::vector<cf32_t> &a,
 // singular values and condition number.
 // This routine first sets the singular values in
 // the array S, then generates two orthogonal matrices
-// Q1 and Q2 using QR factorisation, and form the
+// Q1 and Q2 using QR factorization, and forms the
 // final matrix as Q1 * S * Q2.
 static inline void generate_svd_matrix(const int m, const int n,
                                        std::vector<armral_cmplx_f32_t> &a,
-                                       std::vector<float> &s, const float cond,
-                                       const int seed) {
+                                       std::vector<float32_t> &s,
+                                       const float32_t cond, const int seed) {

   // Generate singular values from 1 to 1/cond
   // where cond is the condition number of the matrix
   for (int i = 0; i < n; i++) {
-    float rcond = (1 - 1 / cond);
-    s[i] = 1 - (float)i / (n - 1) * rcond;
+    float32_t rcond = (1 - 1 / cond);
+    s[i] = 1 - (float32_t)i / (n - 1) * rcond;
   }

   srand(seed);
@@ -368,7 +368,7 @@
   }
   apply_q(m, n, a1, tau1, a_cmplx);

-  // Convert vector<std::complex<float>> to vector<armral_cmplx_f32_t>
+  // Convert vector<std::complex<float32_t>> to vector<armral_cmplx_f32_t>
   for (int i = 0; i < m * n; ++i) {
     a[i] = {real(a_cmplx[i]), imag(a_cmplx[i])};
   }
@@ -389,9 +389,9 @@
 // the bidiagonal matrix B. Note that this routine
 // returns directly the conjugate transpose of the
 // left orthogonal matrix.
-static inline void bidiagonalisation(const int m, const int n, cf32_t *a,
-                                     std::vector<float> &d,
-                                     std::vector<float> &e,
+static inline void bidiagonalization(const int m, const int n, cf32_t *a,
+                                     std::vector<float32_t> &d,
+                                     std::vector<float32_t> &e,
                                      std::vector<cf32_t> &tauq,
                                      std::vector<cf32_t> &taup) {
@@ -464,32 +464,33 @@
 }

 // Computation of Givens rotation components.
-inline static std::tuple<float, float, float> rotg(const float f,
-                                                   const float g) {
+inline static std::tuple<float32_t, float32_t, float32_t>
+rotg(const float32_t f, const float32_t g) {
   if (f == 0) {
-    float cs = 0.0F;
-    float sn = 1.0F;
+    float32_t cs = 0.0F;
+    float32_t sn = 1.0F;
     return std::make_tuple(cs, sn, g);
   }
   if (std::abs(f) > std::abs(g)) {
-    float t = g / f;
-    float tt = sqrt(1 + t * t);
-    float cs = 1 / tt;
-    float sn = t / tt;
+    float32_t t = g / f;
+    float32_t tt = sqrt(1 + t * t);
+    float32_t cs = 1 / tt;
+    float32_t sn = t / tt;
     return std::make_tuple(cs, sn, f * tt);
   }
-  float t = f / g;
-  float tt = sqrt(1 + t * t);
-  float sn = 1 / tt;
-  float cs = t / tt;
+  float32_t t = f / g;
+  float32_t tt = sqrt(1 + t * t);
+  float32_t sn = 1 / tt;
+  float32_t cs = t / tt;
   return std::make_tuple(cs, sn, g * tt);
 }

 // This routine updates singular vectors
 // by applying the Givens rotations
 // used to update the bidiagonal matrix
-inline static void update_sigvect(const int m, const float cs, const float sn,
-                                  cf32_t *v1, cf32_t *v2, const int incv) {
+inline static void update_sigvect(const int m, const float32_t cs,
+                                  const float32_t sn, cf32_t *v1, cf32_t *v2,
+                                  const int incv) {
   for (int i = 0; i < m * incv; i += incv) {
     cf32_t t = v1[i];
     v1[i] = cs * t + sn * v2[i];
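A sanity check on rotg is useful orientation here (an editorial sketch, not part of the patch): the triple (cs, sn, r) it returns satisfies cs * f + sn * g = r and -sn * f + cs * g = 0 with cs^2 + sn^2 = 1, i.e. it is the plane rotation that annihilates g. With the classic 3-4-5 pair:

    // rotg(3.0F, 4.0F): takes the |f| <= |g| branch
    //   t = f / g = 0.75, tt = sqrt(1 + t * t) = 1.25
    //   sn = 1 / tt = 0.8, cs = t / tt = 0.6, r = g * tt = 5
    // check: 0.6 * 3 + 0.8 * 4 = 5 and -0.8 * 3 + 0.6 * 4 = 0

update_sigvect then applies exactly this rotation to pairs of singular-vector columns inside the QR sweeps that follow.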
@@ -510,9 +511,9 @@ inline static void update_sigvect(const int m, const float cs, const float sn,
 // "Singular Value Decomposition and Least Squares Solutions"
 // published in Numer. Math. 14, 403--420 (1970).
 inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m,
-                                 const int n, std::vector<float> &d,
-                                 std::vector<float> &e, cf32_t *u, cf32_t *vt,
-                                 const int u_stride) {
+                                 const int n, std::vector<float32_t> &d,
+                                 std::vector<float32_t> &e, cf32_t *u,
+                                 cf32_t *vt, const int u_stride) {
   if (m < n) {
     // GCOVR_EXCL_START
@@ -530,14 +531,14 @@
   // Compute the 1-norm of the bidiagonal matrix
   // for the computation of the stopping criteria.
-  float anorm = 0;
+  float32_t anorm = 0;
   for (int i = 0; i < n; i++) {
-    float tmp = std::abs(d[i]) + std::abs(e[i]);
+    float32_t tmp = std::abs(d[i]) + std::abs(e[i]);
     if (anorm < tmp) {
       anorm = tmp;
     }
   }
-  float tol = THRESHOLD * anorm * std::numeric_limits<float>::epsilon();
+  float32_t tol = THRESHOLD * anorm * std::numeric_limits<float32_t>::epsilon();
   int maxiter = 2 * n;

   // Loop over the columns
@@ -567,16 +568,16 @@
       // In this case, an extra sequence of Givens rotations is
       // applied from the left to annihilate the off-diagonal E[next_col].
       if (diag_is_zero) {
-        float cs = 0.0;
-        float sn = 1.0;
+        float32_t cs = 0.0;
+        float32_t sn = 1.0;
         for (int i = next_col; i < curr_col; i++) {
-          float f = sn * e[i];
+          float32_t f = sn * e[i];
           e[i] *= cs;
           if (std::abs(f) <= tol) {
             break;
           }
-          float g = d[i];
-          float h;
+          float32_t g = d[i];
+          float32_t h;
           std::tie(cs, sn, h) = rotg(f, g);
           d[i] = h;
           // Update left singular vectors.
@@ -586,7 +587,7 @@
         }
       }
     }
-    float z = d[curr_col];
+    float32_t z = d[curr_col];
     if (next_col == curr_col) {
       // Make singular value nonnegative and update
       // the corresponding right singular vectors.
@@ -618,20 +619,20 @@
     // the 2 eigenvalues are (d1 + d2)/2 +/- sqrt(((d1 - d2)/2)^2 + e1^2).
     // The choice of this shift accelerates the convergence of the
     // bottommost off-diagonal E[curr_col] to zero.
-    float x = d[next_col];
-    float y = d[curr_col - 1];
-    float g = e[curr_col - 1];
-    float h = e[curr_col];
+    float32_t x = d[next_col];
+    float32_t y = d[curr_col - 1];
+    float32_t g = e[curr_col - 1];
+    float32_t h = e[curr_col];
     // a^2 - b^2 operations are computed as
     // (a - b)* (a + b) to avoid overflow.
-    float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+    float32_t f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
     g = sqrt(f * f + 1);
     f = ((x - z) * (x + z) + h * (y / (f + copysign(g, f)) - h)) / x;

     // Shifted QR iteration, bulge chasing, applying
     // successive Givens rotations from right then from left.
-    float c = 1.0F;
-    float s = 1.0F;
+    float32_t c = 1.0F;
+    float32_t s = 1.0F;
     for (int i = next_col + 1; i <= curr_col; i++) {
       g = e[i];
       y = d[i];
@@ -693,18 +694,18 @@

 // armral_svd computes the SVD decomposition
 // of an m-by-n matrix A by first performing
-// the bidigonalisation of A, then computing
+// the bidiagonalization of A, then computing
 // the SVD of the bidiagonal matrix and update
 // the singular vectors if required.
 static inline int svd_cf32(bool gen_singular_vect, const int m, const int n,
-                           std::vector<cf32_t> &a, std::vector<float> &s,
+                           std::vector<cf32_t> &a, std::vector<float32_t> &s,
                            std::vector<cf32_t> &u, std::vector<cf32_t> &vt) {

-  // Bidiagonalisation: A = Q * B * P^H.
+  // Bidiagonalization: A = Q * B * P^H.
   std::vector<cf32_t> tauq(n);
   std::vector<cf32_t> taup(n);
-  std::vector<float> e(n);
-  bidiagonalisation(m, n, a.data(), s, e, tauq, taup);
+  std::vector<float32_t> e(n);
+  bidiagonalization(m, n, a.data(), s, e, tauq, taup);

   // Generate left and right orthogonal vectors if required.
   if (gen_singular_vect) {
@@ -722,14 +723,14 @@

 // armral_svd computes the SVD decomposition
 // of an m-by-n matrix A in 4 steps.
-// 1- QR factorisation of A.
-// 2- Bidiagonalisation of R.
+// 1- QR factorization of A.
+// 2- Bidiagonalization of R.
 // 3- SVD of the bidiagonal matrix from R.
 // 4- Update of the left singular vectors
 // with the orthogonal matrix from QR.
 static inline int qr_svd_cf32(const bool gen_singular_vect, const int m,
                               const int n, std::vector<cf32_t> &a,
-                              std::vector<float> &s, std::vector<cf32_t> &u,
+                              std::vector<float32_t> &s, std::vector<cf32_t> &u,
                               std::vector<cf32_t> &vt) {

   column_major_matrix_view a_mat{a.data(), m};
@@ -745,18 +746,18 @@
       r_mat(i, j) = a_mat(i, j);
     }
   }
-  // Bidiagonalisation of R.
+  // Bidiagonalization of R.
   std::vector<cf32_t> tauq(n);
   std::vector<cf32_t> taup(n);
-  std::vector<float> e(n);
-  bidiagonalisation(n, n, r.data(), s, e, tauq, taup);
+  std::vector<float32_t> e(n);
+  bidiagonalization(n, n, r.data(), s, e, tauq, taup);

   // Generate left and right orthogonal vectors.
   if (gen_singular_vect) {
     // Generate Q, and store it in u1.
     std::vector<cf32_t> u1 = get_q(n, n, r, tauq);
     // Copy u1 in u
-    // Initialise u to zero in case it is not.
+    // Initialize u to zero in case it is not.
     u.assign(u.size(), 0.0F);
     column_major_matrix_view u_mat{u.data(), m};
     column_major_matrix_view u1_mat{u1.data(), n};
@@ -782,7 +783,7 @@

 // Check ||Id - Q^H * Q||_∞/n < THRESHOLD * epsilon
 static inline bool check_orthogonality(const int m, const int n, cf32_t *q) {
-  float tol = THRESHOLD * n * std::numeric_limits<float>::epsilon();
+  float32_t tol = THRESHOLD * n * std::numeric_limits<float32_t>::epsilon();

   // Build an identity matrix Id
   std::vector<cf32_t> a(n * n);
@@ -800,7 +801,7 @@
     }
   }
   // Compute the infinity norm ||Id - Q^H * Q||_∞
-  float inorm = infinity_norm(n, n, a);
+  float32_t inorm = infinity_norm(n, n, a);

   return inorm < tol;
 }
@@ -819,9 +820,10 @@
   column_major_matrix_view a_mat{a, m};

   // Infinity norm of Aref
-  float anorm = infinity_norm(m, n, aref);
+  float32_t anorm = infinity_norm(m, n, aref);
   anorm = anorm > 1 ? anorm : 1;
-  float tol = THRESHOLD * anorm * m * std::numeric_limits<float>::epsilon();
+  float32_t tol =
+      THRESHOLD * anorm * m * std::numeric_limits<float32_t>::epsilon();

   // Extract R, allocate m-by-n memory for
   // the multiplication by A later
@@ -847,7 +849,7 @@
     }
   }
   // Compute the norm of Aref - Q * R
-  float error = infinity_norm(m, n, c);
+  float32_t error = infinity_norm(m, n, c);

   return error < tol;
 }
@@ -890,16 +892,16 @@ static inline void matmul(int m, int n, int k, const std::vector<cf32_t> &a,
 // Check || A - Q * B * P^H||_∞/ || A ||_∞ < tol
 // where B is an upper bidiagonal matrix with diagonal
 // entries in D, and superdiagonal entries in E.
-static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref,
-                                              const cf32_t *a, const float *d,
-                                              const float *e,
-                                              const cf32_t *tauq,
-                                              const cf32_t *taup) {
+static inline bool
+check_bidiag_decomposition(int m, int n, const cf32_t *aref, const cf32_t *a,
+                           const float32_t *d, const float32_t *e,
+                           const cf32_t *tauq, const cf32_t *taup) {
   // Infinity norm of aref
-  float anorm = infinity_norm(m, n, aref);
+  float32_t anorm = infinity_norm(m, n, aref);
   anorm = anorm > 1 ? anorm : 1;
-  float tol = THRESHOLD * anorm * m * std::numeric_limits<float>::epsilon();
+  float32_t tol =
+      THRESHOLD * anorm * m * std::numeric_limits<float32_t>::epsilon();

   // Generate right orthogonal vectors
@@ -931,7 +933,7 @@
     }
   }
   // Infinity norm of error
-  float error = infinity_norm(m, n, c);
+  float32_t error = infinity_norm(m, n, c);

   return error < tol;
 }
@@ -939,19 +941,19 @@
 // with std::vector data type as input
 static inline bool check_bidiag_decomposition(
     int m, int n, const std::vector<cf32_t> &aref, const std::vector<cf32_t> &a,
-    const std::vector<float> &d, const std::vector<float> &e,
+    const std::vector<float32_t> &d, const std::vector<float32_t> &e,
    const std::vector<cf32_t> &tauq, const std::vector<cf32_t> &taup) {
   return check_bidiag_decomposition(m, n, aref.data(), a.data(), d.data(),
                                     e.data(), tauq.data(), taup.data());
 }

-static inline bool check_singular_values(int n, const float *sref,
-                                         const float *s) {
-  float tol = THRESHOLD * n * std::numeric_limits<float>::epsilon();
-  float error = 0.0F;
+static inline bool check_singular_values(int n, const float32_t *sref,
+                                         const float32_t *s) {
+  float32_t tol = THRESHOLD * n * std::numeric_limits<float32_t>::epsilon();
+  float32_t error = 0.0F;
   for (int i = 0; i < n; i++) {
-    float tmp = std::abs(sref[i] - s[i]);
+    float32_t tmp = std::abs(sref[i] - s[i]);
     if (tmp > error) {
       error = tmp;
     }
@@ -961,21 +963,23 @@

 // Overload check_singular_values with an interface
 // with std::vector data type as inputs
-static inline bool check_singular_values(int n, const std::vector<float> &sref,
-                                         const std::vector<float> &s) {
+static inline bool check_singular_values(int n,
+                                         const std::vector<float32_t> &sref,
+                                         const std::vector<float32_t> &s) {
   return check_singular_values(n, sref.data(), s.data());
 }

 // Check the accuracy of SVD decomposition
 // error = ||A - U * S *VT^H||_∞/ (||A||_∞ * m)
 static inline bool check_svd_decomposition(int m, int n, const cf32_t *a,
-                                           const float *s, const cf32_t *u,
+                                           const float32_t *s, const cf32_t *u,
                                            const cf32_t *vt) {
   // Infinity norm of A
-  float anorm = infinity_norm(m, n, a);
+  float32_t anorm = infinity_norm(m, n, a);
   anorm = anorm > 1 ? anorm : 1;
-  float tol = THRESHOLD * anorm * m * std::numeric_limits<float>::epsilon();
+  float32_t tol =
+      THRESHOLD * anorm * m * std::numeric_limits<float32_t>::epsilon();

   // U1 = U * S
   std::vector<cf32_t> u1(m * n);
@@ -994,7 +998,7 @@
   matmul(m, n, n, u1.data(), vt, -1.0F, c.data());

   // Compute the infinity norm ||A - U * S * VT^H||_oo
-  float error = infinity_norm(m, n, c);
+  float32_t error = infinity_norm(m, n, c);

   return error < tol;
 }
@@ -1002,7 +1006,7 @@
 // with std::vector data type as inputs
 static inline bool
 check_svd_decomposition(int m, int n, const std::vector<cf32_t> &a,
-                        const std::vector<float> &s,
+                        const std::vector<float32_t> &s,
                         const std::vector<cf32_t> &u,
                         const std::vector<cf32_t> &vt) {
   return check_svd_decomposition(m, n, a.data(), s.data(), u.data(), vt.data());
diff --git a/test/CRC/main.cpp b/test/UpperPHY/CRC/main.cpp
similarity index 97%
rename from test/CRC/main.cpp
rename to test/UpperPHY/CRC/main.cpp
index ed3941a277f3ef9e73ceccf7aab095fc3eaa501d..47a4d9b4e6bf718455f7b844374fd3f8eb44783c 100644
--- a/test/CRC/main.cpp
+++ b/test/UpperPHY/CRC/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/ConvCoding/decoding/main.cpp b/test/UpperPHY/ConvolutionalDecoder/main.cpp
similarity index 97%
rename from test/ConvCoding/decoding/main.cpp
rename to test/UpperPHY/ConvolutionalDecoder/main.cpp
index d768fbe6bce1dd611dc39fc56d5ef36a869a44e0..dcebd77001e8343e5a536078c0d942d3e9d78c57 100644
--- a/test/ConvCoding/decoding/main.cpp
+++ b/test/UpperPHY/ConvolutionalDecoder/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "bit_utils.hpp"
diff --git a/test/ConvCoding/encoding/main.cpp b/test/UpperPHY/ConvolutionalEncoder/main.cpp
similarity index 96%
rename from test/ConvCoding/encoding/main.cpp
rename to test/UpperPHY/ConvolutionalEncoder/main.cpp
index fab64d1888c66ce983789cf74e5496f9442282de..640bd1f36db42399b655a8d65e009f21268f0fd9 100644
--- a/test/ConvCoding/encoding/main.cpp
+++ b/test/UpperPHY/ConvolutionalEncoder/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "bit_utils.hpp"
diff --git a/test/Demodulation/main.cpp b/test/UpperPHY/Demodulation/main.cpp
similarity index 97%
rename from test/Demodulation/main.cpp
rename to test/UpperPHY/Demodulation/main.cpp
index 4833b650017f4b24b1bfb7badfaf1c4d16db469c..bfc68be505a3c72f79d2d6a91ecbecfb0050283e 100644
--- a/test/Demodulation/main.cpp
+++ b/test/UpperPHY/Demodulation/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "cs16_utils.hpp"
@@ -108,7 +108,7 @@ static void demodulation_64qam_ref(const uint32_t n_symbols, const uint16_t ulp,
   // proportional to an offset of the modulated symbol received
   uint16_t weight = (1 << 15) / ulp;
   // The amplitudes are in {1, 3, 5, 7} / sqrt(42)
-  // These are organised using a Gray encoding, in the following manner
+  // These are organized using a Gray encoding, in the following manner
   // 01 -> 1/sqrt(42)
   // 00 -> 3/sqrt(42)
   // 10 -> 5/sqrt(42)
@@ -148,7 +148,7 @@ static void demodulation_256qam_ref(const uint32_t n_symbols,
   uint16_t weight = (1 << 15) / ulp;

   // The amplitudes are in {1, 3, 5, 7, 9, 11, 13, 15} / sqrt(170)
-  // These are organised in a Gray encoding, and we can get the
+  // These are organized in a Gray encoding, and we can get the
   // log-likelihood ratios by performing the following operations
   // for each of the 8 bits encoded in the symbol s = {s.re, s.im}
   // LLR(b0|s) = weight * -s.re
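A short numeric illustration of the weight scaling shared by the reference demodulators above (the ulp value here is hypothetical): weight = (1 << 15) / ulp converts a symbol offset into fixed-point LLR units, so

    // ulp = 1 << 8  =>  weight = 32768 / 256 = 128
    // e.g. LLR(b0|s) = weight * -s.re for the 256-QAM case above

and halving ulp doubles the magnitude of every reported log-likelihood ratio.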
diff --git a/test/LDPC/decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp
similarity index 98%
rename from test/LDPC/decoding/main.cpp
rename to test/UpperPHY/LDPC/Decoding/main.cpp
index 9362a05146abc765d3da0d3393e5c80f0c08d454..f557ea25fe5fb9d0bf880858061c7c6bc90a2620 100644
--- a/test/LDPC/decoding/main.cpp
+++ b/test/UpperPHY/LDPC/Decoding/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "../ldpc_test_common.hpp"
diff --git a/test/LDPC/encoding/ldpc_encoding_test_data.h b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h
similarity index 98%
rename from test/LDPC/encoding/ldpc_encoding_test_data.h
rename to test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h
index 09947ba6b610fca5e071cc850027d84f9d4dbb72..4a94f7a4937702dea45e2c39ebab6ffb2b82fe0a 100644
--- a/test/LDPC/encoding/ldpc_encoding_test_data.h
+++ b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/test/LDPC/encoding/main.cpp b/test/UpperPHY/LDPC/Encoding/main.cpp
similarity index 98%
rename from test/LDPC/encoding/main.cpp
rename to test/UpperPHY/LDPC/Encoding/main.cpp
index 8b78ea40bc6dd034ee3a67e4db8bdf523fd150eb..59ba4e3c6fa1b77222fb2a0d61581b8838efed23 100644
--- a/test/LDPC/encoding/main.cpp
+++ b/test/UpperPHY/LDPC/Encoding/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "../ldpc_test_common.hpp"
 #include "armral.h"
@@ -195,7 +195,7 @@ std::vector<uint8_t> armral_ldpc_encode_block_ref(const uint8_t *data_in,
   // Perform the encoding
   // We need to invert a system of equations for the
   // first 4 * z rows, which correspond to the high-density
-  // sub-matrix portion. Initialisation is to zero
+  // sub-matrix portion. Initialization is to zero
   std::vector<uint8_t> parity_hdsm(4 * z);

   // Rename a variable for clarity about how it is used in this function
diff --git a/test/LDPC/rate_matching/main.cpp b/test/UpperPHY/LDPC/RateMatching/main.cpp
similarity index 99%
rename from test/LDPC/rate_matching/main.cpp
rename to test/UpperPHY/LDPC/RateMatching/main.cpp
index 783c882a2b825dcf994a43c81fd43eafcffa507e..ceb726744a69a271ed41bdbdc6aad8281d968b6c 100644
--- a/test/LDPC/rate_matching/main.cpp
+++ b/test/UpperPHY/LDPC/RateMatching/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "bit_utils.hpp"
diff --git a/test/LDPC/rate_recovery/main.cpp b/test/UpperPHY/LDPC/RateRecovery/main.cpp
similarity index 99%
rename from test/LDPC/rate_recovery/main.cpp
rename to test/UpperPHY/LDPC/RateRecovery/main.cpp
index 499d98bf3e41b99e63a6b5b5e2cb4bdaa99bf232..993b08e5922f21677bb31b699a0daf60360b7f40 100644
--- a/test/LDPC/rate_recovery/main.cpp
+++ b/test/UpperPHY/LDPC/RateRecovery/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "int8_utils.hpp"
@@ -215,7 +215,6 @@ bool test_ref_rate_recovery() {
   passed &= std::equal(out.begin(), out.begin() + k0, in.begin() + (e - k0));

   // Test selection process with shortening
-  e = 80;
   n = 100;
   k0 = 16;
diff --git a/test/LDPC/ldpc_test_common.hpp b/test/UpperPHY/LDPC/ldpc_test_common.hpp
similarity index 95%
rename from test/LDPC/ldpc_test_common.hpp
rename to test/UpperPHY/LDPC/ldpc_test_common.hpp
index 0623f9f5976381200498ee49c008c6030373072b..2b8d4a97c986db42abcc4b13cd5700331f4cba4a 100644
--- a/test/LDPC/ldpc_test_common.hpp
+++ b/test/UpperPHY/LDPC/ldpc_test_common.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
diff --git a/test/Modulation/main.cpp b/test/UpperPHY/Modulation/main.cpp
similarity index 99%
rename from test/Modulation/main.cpp
rename to test/UpperPHY/Modulation/main.cpp
index 0cb0a3ad89f10fdbfbac25dd68edf7d240a1b9d6..2fff2b6aeb44dfb867e2d4908574d0a0031b3407 100644
--- a/test/Modulation/main.cpp
+++ b/test/UpperPHY/Modulation/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "cs16_utils.hpp"
diff --git a/test/Polar/crc_attachment/main.cpp b/test/UpperPHY/Polar/CrcAttachment/main.cpp
similarity index 93%
rename from test/Polar/crc_attachment/main.cpp
rename to test/UpperPHY/Polar/CrcAttachment/main.cpp
index 8d67cd49cd15fcdd77b407a944f39de441ec920e..21f1f71606143eede8c175d1beef5188c0893548 100644
--- a/test/Polar/crc_attachment/main.cpp
+++ b/test/UpperPHY/Polar/CrcAttachment/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "int8_utils.hpp"
 #include "polar_crc_attach_data.hpp"
diff --git a/test/Polar/crc_attachment/polar_crc_attach_data.hpp b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp
similarity index 85%
rename from test/Polar/crc_attachment/polar_crc_attach_data.hpp
rename to test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp
index cb8c986ddb2ce7a083167b86853d4a30ea01fa0a..555f4af61e61948ae40b2f804ac5da10c09b352c 100644
--- a/test/Polar/crc_attachment/polar_crc_attach_data.hpp
+++ b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/test/Polar/decoding/main.cpp b/test/UpperPHY/Polar/Decoding/main.cpp
similarity index 97%
rename from test/Polar/decoding/main.cpp
rename to test/UpperPHY/Polar/Decoding/main.cpp
index 5b36846ddbd5b7de3ebe25cf5d4b493e00e2d21a..e6f48ac12a5924cd456ba953b4866cc2f4b50a52 100644
--- a/test/Polar/decoding/main.cpp
+++ b/test/UpperPHY/Polar/Decoding/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/Polar/encoding/main.cpp b/test/UpperPHY/Polar/Encoding/main.cpp
similarity index 92%
rename from test/Polar/encoding/main.cpp
rename to test/UpperPHY/Polar/Encoding/main.cpp
index 7c1d9ac2dbf51b3501a57267e5132c7725f356d5..ae53d57eaa197345f98dd468de4d51370569ef27 100644
--- a/test/Polar/encoding/main.cpp
+++ b/test/UpperPHY/Polar/Encoding/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "cs16_utils.hpp"
 #include "int8_utils.hpp"
diff --git a/test/Polar/frozen/main.cpp b/test/UpperPHY/Polar/Frozen/main.cpp
similarity index 98%
rename from test/Polar/frozen/main.cpp
rename to test/UpperPHY/Polar/Frozen/main.cpp
index 5be4671fac38eebb6ac2446c8f16a6e75e3b627f..341383fb36463eff4d7cf72b6a59b7c11e33b2a2 100644
--- a/test/Polar/frozen/main.cpp
+++ b/test/UpperPHY/Polar/Frozen/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "int8_utils.hpp"
diff --git a/test/Polar/rate_matching/main.cpp b/test/UpperPHY/Polar/RateMatching/main.cpp
similarity index 98%
rename from test/Polar/rate_matching/main.cpp
rename to test/UpperPHY/Polar/RateMatching/main.cpp
index 6afd323d704b83d6e7c6c17516603359efc76d33..4bbc0179921603a690cc5cab5ad1afc5be4c7eef 100644
--- a/test/Polar/rate_matching/main.cpp
+++ b/test/UpperPHY/Polar/RateMatching/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "int8_utils.hpp"
diff --git a/test/Polar/rate_recovery/main.cpp b/test/UpperPHY/Polar/RateRecovery/main.cpp
similarity index 98%
rename from test/Polar/rate_recovery/main.cpp
rename to test/UpperPHY/Polar/RateRecovery/main.cpp
index fe3ce7d4069852f11a0177b4999cde83e8ce9313..8d7d5b743b2957328de048c801ab55b0c11efea0 100644
--- a/test/Polar/rate_recovery/main.cpp
+++ b/test/UpperPHY/Polar/RateRecovery/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
#include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/subchannel_deinterleave/main.cpp b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp similarity index 92% rename from test/Polar/subchannel_deinterleave/main.cpp rename to test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index 59b3ef8e890fdd0b45561c86c7829cffe4e865dc..0b0840f4a64b225c60ed84dfdd47ddb49ff17d8c 100644 --- a/test/Polar/subchannel_deinterleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/subchannel_interleave/main.cpp b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp similarity index 95% rename from test/Polar/subchannel_interleave/main.cpp rename to test/UpperPHY/Polar/SubchannelInterleave/main.cpp index bfc6e55d6277d9abe4480203b89f3e152ff1f583..4a83cb0d2dd0a77b7b194a6a332aa15654c5dbe9 100644 --- a/test/Polar/subchannel_interleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/decoding/main.cpp b/test/UpperPHY/Turbo/Decoding/main.cpp similarity index 96% rename from test/Turbo/decoding/main.cpp rename to test/UpperPHY/Turbo/Decoding/main.cpp index 027056ed5b2768b88f916b0a7a52d26ec0a975fd..af4d929785fc0465866480e0441ab96bc3d67a82 100644 --- a/test/Turbo/decoding/main.cpp +++ b/test/UpperPHY/Turbo/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -11,7 +11,7 @@ #include // Check that the decoder returns the expected error code when -// passed an invalid value of k. We can safely pass uninitialised +// passed an invalid value of k. We can safely pass uninitialized // memory to the routine as the parameter test is the first thing // it does and it will return immediately when k is invalid. static bool run_turbo_decoding_parameter_test() { diff --git a/test/Turbo/encoding/main.cpp b/test/UpperPHY/Turbo/Encoding/main.cpp similarity index 95% rename from test/Turbo/encoding/main.cpp rename to test/UpperPHY/Turbo/Encoding/main.cpp index b945f22727afb5669639ab84b82aa5ae7da1c6a2..072218cfb4e688a3d589c4fbdc5ebed5cdd641a1 100644 --- a/test/Turbo/encoding/main.cpp +++ b/test/UpperPHY/Turbo/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -12,7 +12,7 @@ #include // Check that the encoder returns the expected error code when -// passed an invalid value of k. We can safely pass uninitialised +// passed an invalid value of k. We can safely pass uninitialized // memory to the routine as the parameter test is the first thing // it does and it will return immediately when k is invalid. 
 static bool run_turbo_encoding_parameter_test() {
diff --git a/test/Turbo/encoding/reference_turbo_encoder.hpp b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp
similarity index 99%
rename from test/Turbo/encoding/reference_turbo_encoder.hpp
rename to test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp
index aec3668e73f782d509e85a84e4bf42729a3e7035..451b6b2dd7fc73c1e9f4a86cea4cec7f45892ef5 100644
--- a/test/Turbo/encoding/reference_turbo_encoder.hpp
+++ b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/test/Turbo/rate_matching/main.cpp b/test/UpperPHY/Turbo/RateMatching/main.cpp
similarity index 99%
rename from test/Turbo/rate_matching/main.cpp
rename to test/UpperPHY/Turbo/RateMatching/main.cpp
index be9c29f4bfca60a57d9b6e0254c522df742bab32..4353ab555a2c5fe4c89ff2091f2468a7d9c0b488 100644
--- a/test/Turbo/rate_matching/main.cpp
+++ b/test/UpperPHY/Turbo/RateMatching/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "int8_utils.hpp"
diff --git a/test/Turbo/rate_recovery/main.cpp b/test/UpperPHY/Turbo/RateRecovery/main.cpp
similarity index 94%
rename from test/Turbo/rate_recovery/main.cpp
rename to test/UpperPHY/Turbo/RateRecovery/main.cpp
index 36c748b9d7da78707ba82e1940f025ec59b3075d..5f91d3aa4770f43fe5c10c456dbe2a07854b2a75 100644
--- a/test/Turbo/rate_recovery/main.cpp
+++ b/test/UpperPHY/Turbo/RateRecovery/main.cpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "armral.h"
 #include "int8_utils.hpp"
diff --git a/test/Turbo/rate_recovery/rate_recovery_data.hpp b/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp
similarity index 99%
rename from test/Turbo/rate_recovery/rate_recovery_data.hpp
rename to test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp
index 0117f1481118fcf05c2c412a3436a35d42afc8c0..361289e217e2d35d9b29421f786fd7f05f69d7e3 100644
--- a/test/Turbo/rate_recovery/rate_recovery_data.hpp
+++ b/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/test/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp
similarity index 89%
rename from test/Turbo/turbo_test_data.hpp
rename to test/UpperPHY/Turbo/turbo_test_data.hpp
index cc47d69638e1c42eccdb309f021fa9cf4c18cbfa..4507b9bb4d56482b1cc1c29b677a3e1a82914371 100644
--- a/test/Turbo/turbo_test_data.hpp
+++ b/test/UpperPHY/Turbo/turbo_test_data.hpp
@@ -1,14 +1,14 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once

 #include "rng.hpp"

 static void generate_turbo_test_data(uint8_t *src, uint32_t k) {
-  static linear_congruential_generator lcg;
-  auto state = random_state::from_seeds({k});
+  static armral::utils::linear_congruential_generator lcg;
+  auto state = armral::utils::random_state::from_seeds({k});
   // k is always divisible by 8
divisible by 8 uint32_t k_bytes = k >> 3; diff --git a/utils/bit_utils.hpp b/utils/bit_utils.hpp index 0de9b1c46cb4350c5ea0f1f53ff2e3e5e37e7f20..1ed60cfdc14cbe12fc3bac8f5bffad537a191523 100644 --- a/utils/bit_utils.hpp +++ b/utils/bit_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -97,7 +97,7 @@ static inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { // Loop through all of the llrs, and set the corresponding bit to 1 if LLR is // negative, otherwise to 0. We do not assume that the data_out pointer is -// initialised +// initialized template static inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { uint32_t full_bytes = n >> 3; diff --git a/utils/cf32_utils.hpp b/utils/cf32_utils.hpp index 41f2c4f52ace05165b40b8a1bfb93f0f482fc3cf..f54c10e945678207e8b0f73beee4b4f8d9fedc7d 100644 --- a/utils/cf32_utils.hpp +++ b/utils/cf32_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -43,15 +43,15 @@ class cf32_random { public: cf32_random(std::initializer_list seeds = {42}) - : m_state(random_state::from_seeds(seeds)) {} + : m_state(armral::utils::random_state::from_seeds(seeds)) {} static constexpr armral_cmplx_f32_t default_min{1.0F, 2.0F}; static constexpr armral_cmplx_f32_t default_max{2.0F, 4.0F}; armral_cmplx_f32_t one(armral_cmplx_f32_t min = default_min, armral_cmplx_f32_t max = default_max) { - return armral_cmplx_f32_t{rand(min.re, max.re), - rand(min.im, max.im)}; + return armral_cmplx_f32_t{rand(min.re, max.re), + rand(min.im, max.im)}; } std::vector vector(size_t len, @@ -65,11 +65,11 @@ public: } std::vector & - flip_signs(std::vector &vector, float chance_re = 0.5F, - float chance_im = 0.5F) { + flip_signs(std::vector &vector, + float32_t chance_re = 0.5F, float32_t chance_im = 0.5F) { for (auto &cmplx : vector) { - bool re_flip = rand(0, 1) < chance_re; - bool im_flip = rand(0, 1) < chance_im; + bool re_flip = rand(0, 1) < chance_re; + bool im_flip = rand(0, 1) < chance_im; cmplx.re = re_flip ? -cmplx.re : cmplx.re; cmplx.im = im_flip ? 
diff --git a/utils/cf32_utils.hpp b/utils/cf32_utils.hpp
index 41f2c4f52ace05165b40b8a1bfb93f0f482fc3cf..f54c10e945678207e8b0f73beee4b4f8d9fedc7d 100644
--- a/utils/cf32_utils.hpp
+++ b/utils/cf32_utils.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
@@ -43,15 +43,15 @@ class cf32_random {
 public:
   cf32_random(std::initializer_list<uint32_t> seeds = {42})
-      : m_state(random_state::from_seeds(seeds)) {}
+      : m_state(armral::utils::random_state::from_seeds(seeds)) {}

   static constexpr armral_cmplx_f32_t default_min{1.0F, 2.0F};
   static constexpr armral_cmplx_f32_t default_max{2.0F, 4.0F};

   armral_cmplx_f32_t one(armral_cmplx_f32_t min = default_min,
                          armral_cmplx_f32_t max = default_max) {
-    return armral_cmplx_f32_t{rand<float>(min.re, max.re),
-                              rand<float>(min.im, max.im)};
+    return armral_cmplx_f32_t{rand<float32_t>(min.re, max.re),
+                              rand<float32_t>(min.im, max.im)};
   }

   std::vector<armral_cmplx_f32_t> vector(size_t len,
@@ -65,11 +65,11 @@ public:
   }

   std::vector<armral_cmplx_f32_t> &
-  flip_signs(std::vector<armral_cmplx_f32_t> &vector, float chance_re = 0.5F,
-             float chance_im = 0.5F) {
+  flip_signs(std::vector<armral_cmplx_f32_t> &vector,
+             float32_t chance_re = 0.5F, float32_t chance_im = 0.5F) {
     for (auto &cmplx : vector) {
-      bool re_flip = rand<float>(0, 1) < chance_re;
-      bool im_flip = rand<float>(0, 1) < chance_im;
+      bool re_flip = rand<float32_t>(0, 1) < chance_re;
+      bool im_flip = rand<float32_t>(0, 1) < chance_im;
       cmplx.re = re_flip ? -cmplx.re : cmplx.re;
       cmplx.im = im_flip ? -cmplx.im : cmplx.im;
     }
@@ -77,20 +77,20 @@ public:
   }

   std::vector<armral_cmplx_f32_t>
-  flip_signs(std::vector<armral_cmplx_f32_t> &&vector, float chance_re = 0.5F,
-             float chance_im = 0.5F) {
+  flip_signs(std::vector<armral_cmplx_f32_t> &&vector,
+             float32_t chance_re = 0.5F, float32_t chance_im = 0.5F) {
     auto result = std::move(vector);
     return flip_signs(result);
   }

 private:
   template<typename T>
-  float rand(float min, float max) {
-    linear_congruential_generator lcg;
+  float32_t rand(float32_t min, float32_t max) {
+    armral::utils::linear_congruential_generator lcg;
     return lcg.one<T>(&m_state, min, max);
   }

-  random_state m_state;
+  armral::utils::random_state m_state;
 };

 static inline std::vector<armral_cmplx_f32_t>
@@ -126,7 +126,7 @@ narrow_to_cf32(const std::vector<std::complex<double>> &a) {
 }

 static inline std::vector<armral_cmplx_f32_t>
-pack_cf32(const std::vector<float> &re, const std::vector<float> &im) {
+pack_cf32(const std::vector<float32_t> &re, const std::vector<float32_t> &im) {
   assert(re.size() == im.size());
   std::vector<armral_cmplx_f32_t> ret(re.size());
   for (unsigned i = 0; i < ret.size(); ++i) {
@@ -135,18 +135,18 @@
   return ret;
 }

-static inline std::vector<float>
+static inline std::vector<float32_t>
 unpack_real_cf32(const std::vector<armral_cmplx_f32_t> &in) {
-  std::vector<float> ret(in.size());
+  std::vector<float32_t> ret(in.size());
   for (unsigned i = 0; i < ret.size(); ++i) {
     ret[i] = in[i].re;
   }
   return ret;
 }

-static inline std::vector<float>
+static inline std::vector<float32_t>
 unpack_imag_cf32(const std::vector<armral_cmplx_f32_t> &in) {
-  std::vector<float> ret(in.size());
+  std::vector<float32_t> ret(in.size());
   for (unsigned i = 0; i < ret.size(); ++i) {
     ret[i] = in[i].im;
   }
@@ -172,26 +172,27 @@ unpack_imag_cf32(const std::vector<armral_cmplx_f32_t> &in) {
  *
  * Returns true if the elements match elementwise, within tolerance.
  */
-static inline bool check_results_cf32(const char *name, const float *result,
-                                      const float *expected, uint32_t n,
+static inline bool check_results_cf32(const char *name, const float32_t *result,
+                                      const float32_t *expected, uint32_t n,
                                       uint32_t op_count = 400) {
   bool passed = true;
-  float max_error = 0;
-  float diff_at_max_error = 0;
-  float max_diff = 0;
-  float error_at_max_diff = 0;
+  float32_t max_error = 0;
+  float32_t diff_at_max_error = 0;
+  float32_t max_diff = 0;
+  float32_t error_at_max_diff = 0;

-  float relative_tol = op_count * std::numeric_limits<float>::epsilon();
+  float32_t relative_tol = op_count * std::numeric_limits<float32_t>::epsilon();
   // This is an arbitrarily chosen constant.
   // In the future, we would like to tighten the error bounds, which requires
   // problem-specific information, as well as restrictions on input values and
   // sizes.
-  float abs_tol = 0.000015;
+  float32_t abs_tol = 0.000015;

   for (uint32_t i = 0; i < n; ++i) {
-    float diff = fabs(result[i] - expected[i]);
-    float err = expected[i] != 0 ? fabs(diff / expected[i]) : fabs(result[i]);
+    float32_t diff = fabs(result[i] - expected[i]);
+    float32_t err =
+        expected[i] != 0 ? fabs(diff / expected[i]) : fabs(result[i]);
     if (err > max_error) {
       max_error = err;
       diff_at_max_error = diff;
@@ -262,6 +263,6 @@ static inline bool check_results_cf32(const char *name,
                                       const armral_cmplx_f32_t *result,
                                       const armral_cmplx_f32_t *expected,
                                       uint32_t n, uint32_t op_count = 400) {
-  return check_results_cf32(name, (const float *)result,
-                            (const float *)expected, n * 2, op_count);
+  return check_results_cf32(name, (const float32_t *)result,
+                            (const float32_t *)expected, n * 2, op_count);
 }
diff --git a/utils/cs16_utils.hpp b/utils/cs16_utils.hpp
index 6824f3cb75b918b4f521cd3fc8790eb25e364cfa..e4794d65b9fa5cc767e985837a46dc86f18511bd 100644
--- a/utils/cs16_utils.hpp
+++ b/utils/cs16_utils.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/utils/fft_utils.hpp b/utils/fft_utils.hpp
index c99371ccf3a8825a947e279646ecf889db77a899..c34e2592d7f4310b9ed84ee487566eb52e67cda5 100644
--- a/utils/fft_utils.hpp
+++ b/utils/fft_utils.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/utils/int8_utils.hpp b/utils/int8_utils.hpp
index ec5d103c52a1398cfb360e293b8f4fbfb910d6c6..bc5bbdc4847ee680e94a8144a4bdd28207f7135d 100644
--- a/utils/int8_utils.hpp
+++ b/utils/int8_utils.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
diff --git a/utils/matrix_utils.hpp b/utils/matrix_utils.hpp
index d15e31ec8985a85b5fd395b9739a50508722c856..e3a5d0ccaf810959ce78a07f50d8d3d27aab8b01 100644
--- a/utils/matrix_utils.hpp
+++ b/utils/matrix_utils.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */

 #pragma once
@@ -14,13 +14,13 @@
  */
 static inline std::vector<armral_cmplx_f32_t>
 allocate_random_cf32_lin_ind(uint32_t len) {
-  static linear_congruential_generator lcg;
-  auto state = random_state::from_seeds({42});
+  static armral::utils::linear_congruential_generator lcg;
+  auto state = armral::utils::random_state::from_seeds({42});
   std::vector<armral_cmplx_f32_t> ret(len);
   for (uint32_t i = 0; i < len; ++i) {
-    ret[i].re = lcg.one<float>(&state, -100., 100.);
-    ret[i].im = lcg.one<float>(&state, -100., 100.);
+    ret[i].re = lcg.one<float32_t>(&state, -100., 100.);
+    ret[i].im = lcg.one<float32_t>(&state, -100., 100.);
   }
   return ret;
 }
@@ -29,8 +29,8 @@
 * Generate random invertible matrices.
 */
 static inline std::vector<armral_cmplx_f32_t>
-gen_invertible_matrix(uint32_t m, float scale_re = 1.0F,
-                      float scale_im = 1.0F) {
+gen_invertible_matrix(uint32_t m, float32_t scale_re = 1.0F,
+                      float32_t scale_im = 1.0F) {

   auto a = allocate_random_cf32_lin_ind(m * m);
@@ -38,15 +38,16 @@ gen_invertible_matrix(uint32_t m, float scale_re = 1.0F,

   // It is non-singular with high probability by virtue of sampling randomly.
   // If real-part is zeroed-out increase fac to avoid det(a)=0
-  float fac = (scale_re == 0.0F) ? 2.0F : 1.0F;
+  float32_t fac = (scale_re == 0.0F) ? 2.0F : 1.0F;
   for (unsigned i = 0; i < m; ++i) {
     // force non-negative diagonal entries
     a[i * m + i].re = std::abs(a[i * m + i].re);
     for (unsigned j = 0; j < m; ++j) {
       if (i != j) {
-        a[i * m + i].re += fac * std::abs(std::complex<float>(a[i * m + j].re,
-                                                              a[i * m + j].im));
+        a[i * m + i].re +=
+            fac *
+            std::abs(std::complex<float32_t>(a[i * m + j].re, a[i * m + j].im));
       }
     }
   }
 
@@ -58,7 +59,8 @@ gen_invertible_matrix(uint32_t m, float scale_re = 1.0F,
 */
 static inline std::vector<armral_cmplx_f32_t>
 gen_invertible_matrix_batch(uint32_t batch_size, uint32_t m,
-                            float scale_re = 1.0F, float scale_im = 1.0F) {
+                            float32_t scale_re = 1.0F,
+                            float32_t scale_im = 1.0F) {
 
   // Generate batch of matrices
   std::vector<armral_cmplx_f32_t> a(batch_size * m * m);
@@ -77,8 +79,8 @@ gen_invertible_matrix_batch(uint32_t batch_size, uint32_t m,
 
 * definiteness)
 */
 static inline std::vector<armral_cmplx_f32_t>
-gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F,
-                     float scale_im = 1.0F, bool perf = false) {
+gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float32_t scale_re = 1.0F,
+                     float32_t scale_im = 1.0F, bool perf = false) {
   auto a = perf ? std::vector<armral_cmplx_f32_t>(m * m)
                 : allocate_random_cf32_lin_ind(m * m);
@@ -122,16 +124,15 @@ gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F,
 
   // virtue of sampling randomly
   // If real-part is zeroed-out increase fac to avoid det(a)=0
-  float fac = (scale_re == 0.0F) ? 2.0F : 1.0F;
+  float32_t fac = (scale_re == 0.0F) ? 2.0F : 1.0F;
   for (unsigned i = 0; i < m; ++i) {
     // force non-negative diagonal entries
     a[i * m + i].re = std::abs(a[i * m + i].re);
     for (unsigned j = 0; j < m; ++j) {
       if (i != j) {
-        a[i * m + i].re +=
-            fac *
-            std::abs(std::complex<float>(a[i * m + j].re, a[i * m + j].im));
+        a[i * m + i].re += fac * std::abs(std::complex<float32_t>(
+                                     a[i * m + j].re, a[i * m + j].im));
       }
     }
   }
 
@@ -146,7 +147,7 @@ gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F,
 */
 static inline std::vector<armral_cmplx_f32_t>
 gen_hermitian_matrix_batch(uint32_t batch_size, uint32_t m, bool is_hpd = false,
-                           float scale_re = 1.0F, float scale_im = 1.0F,
+                           float32_t scale_re = 1.0F, float32_t scale_im = 1.0F,
                            bool perf = false) {
 
   // Generate batch of matrices
@@ -165,23 +166,23 @@ gen_hermitian_matrix_batch(uint32_t batch_size, uint32_t m, bool is_hpd = false,
 
 * Function to print check results of matrix inversion UTs.
 */
 static bool check_results_mat_inv(
-    const std::string &name, const float *result, const float *expected,
+    const std::string &name, const float32_t *result, const float32_t *expected,
     const uint32_t n_values, /*n_values = 2 * nSamples, due to RE and IM part)*/
-    const float rel_tol_mult = 1.0F, const float abs_tol_mult = 1.0F,
+    const float32_t rel_tol_mult = 1.0F, const float32_t abs_tol_mult = 1.0F,
     int verbose = 0) {
   bool passed = true;
-  float error = 0;
-  float max_error = 0;
+  float32_t error = 0;
+  float32_t max_error = 0;
   // TODO: arbitrarily chosen constant. we should probably do better than this,
   // but until we actually talk to people and get an idea of acceptable
   // tolerances then there's not much point in being too exact here.
-  float relative_tol = 0.00001;  // 10^-5
-  float diff_tolerance = 0.0001; // 10^-4
+  float32_t relative_tol = 0.00001;  // 10^-5
+  float32_t diff_tolerance = 0.0001; // 10^-4
   relative_tol *= rel_tol_mult;
   diff_tolerance *= abs_tol_mult;
 
   for (uint32_t i = 0; i < n_values; ++i) {
-    float diff_abs = fabs(result[i] - expected[i]);
+    float32_t diff_abs = fabs(result[i] - expected[i]);
     error = (expected[i] != 0) ? fabs(diff_abs / expected[i]) : fabs(result[i]);
     max_error = std::max(error, max_error);
@@ -267,18 +268,18 @@ static inline bool check_results_identity(const armral_cmplx_f32_t *mat,
     std::vector<std::complex<double>> mm64(m * m);
     reference_zgemm(m, m, m, 1.0F, m64, inv_m64, 0.0F, mm64);
     convert_vector_to_cf32_array(m * m, mm64, mm.data());
-    passed &= check_results_mat_inv("MM^{-1} - Id", (float *)mm.data(),
-                                    (float *)id.data(), 2 * m * m, (float)m,
-                                    (float)m, verbose);
+    passed &= check_results_mat_inv("MM^{-1} - Id", (float32_t *)mm.data(),
+                                    (float32_t *)id.data(), 2 * m * m,
+                                    (float32_t)m, (float32_t)m, verbose);
   }
   // MM^{-1}
   {
     std::vector<std::complex<double>> mm64(m * m);
     reference_zgemm(m, m, m, 1.0F, inv_m64, m64, 0.0F, mm64);
     convert_vector_to_cf32_array(m * m, mm64, mm.data());
-    passed &= check_results_mat_inv("M^{-1}M - Id", (float *)mm.data(),
-                                    (float *)id.data(), 2 * m * m, (float)m,
-                                    (float)m, verbose);
+    passed &= check_results_mat_inv("M^{-1}M - Id", (float32_t *)mm.data(),
+                                    (float32_t *)id.data(), 2 * m * m,
+                                    (float32_t)m, (float32_t)m, verbose);
   }
   return passed;
 }
diff --git a/utils/qint64.hpp b/utils/qint64.hpp
index 8922edf5d2c174ba35d27bebfccff385b6b91b98..02ed5b0b661060e59fd6a9c6695376bc44a48d45 100644
--- a/utils/qint64.hpp
+++ b/utils/qint64.hpp
@@ -1,6 +1,6 @@
 /*
     Arm RAN Acceleration Library
-    Copyright 2020-2024 Arm Limited and/or its affiliates
+    SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #pragma once
diff --git a/utils/reference_linalg.hpp b/utils/reference_linalg.hpp
index 0960d440ef3843528436fc30658e76e89e7dfa57..605b3db69415d9b03c875dba3eda32b1f1c0f387 100644
--- a/utils/reference_linalg.hpp
+++ b/utils/reference_linalg.hpp
@@ -1,6 +1,6 @@
 /*
     Arm RAN Acceleration Library
-    Copyright 2020-2024 Arm Limited and/or its affiliates
+    SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #pragma once
@@ -257,8 +257,8 @@ std::complex<T> complex_convert(armral_cmplx_f32_t cmplx) {
 
 template<typename T>
 armral_cmplx_f32_t complex_convert(std::complex<T> cmplx) {
-  return armral_cmplx_f32_t{static_cast<float>(cmplx.real()),
-                            static_cast<float>(cmplx.imag())};
+  return armral_cmplx_f32_t{static_cast<float32_t>(cmplx.real()),
+                            static_cast<float32_t>(cmplx.imag())};
 }
 
 /*
@@ -279,7 +279,7 @@ void convert_vector_to_cf32_array(uint16_t nvalues,
                                   const std::vector<std::complex<double>> &a,
                                   armral_cmplx_f32_t *b) {
   for (unsigned i = 0; i < nvalues; ++i) {
-    b[i] = {(float)a[i].real(), (float)a[i].imag()};
+    b[i] = {(float32_t)a[i].real(), (float32_t)a[i].imag()};
   }
 }
 
diff --git a/utils/rng.cpp b/utils/rng.cpp
index e14d97d04617c89840ac0e22e542411c7d066972..7904bc0422234e32cf0f7049b217f8ae0f2fdd1b 100644
--- a/utils/rng.cpp
+++ b/utils/rng.cpp
@@ -1,10 +1,14 @@
 /*
     Arm RAN Acceleration Library
-    Copyright 2020-2024 Arm Limited and/or its affiliates
+    SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #include "rng.hpp"
 
+#include <arm_neon.h>
+
+namespace armral::utils {
+
 static inline uint64_t lcg_step(uint64_t x) {
   x = (x * 1103515245 + 12345) & 0x7fffffffU;
   return x;
 }
@@ -31,10 +35,10 @@ uint32_t linear_congruential_generator::one<uint32_t>(random_state *state) {
 }
 
 template<>
-float linear_congruential_generator::one<float>(random_state *state) {
+float32_t linear_congruential_generator::one<float32_t>(random_state *state) {
   auto x = lcg_step(state->seed);
   state->seed = x;
-  return (float)x / 0x80000000U;
+  return (float32_t)x / 0x80000000U;
 }
 
 template<>
@@ -64,3 +68,5 @@ random_state::from_seeds(const std::initializer_list<uint64_t> seeds) {
   lcg.advance_state(&state, 3);
   return state;
 }
+
+} // namespace armral::utils
diff --git a/utils/rng.hpp b/utils/rng.hpp
index a6e09dc48d74d226a90a72ee64adcbcda32ac377..fb129b017670ee56351dd658c6ce8b8185b41c55 100644
--- a/utils/rng.hpp
+++ b/utils/rng.hpp
@@ -1,6 +1,6 @@
 /*
     Arm RAN Acceleration Library
-    Copyright 2020-2024 Arm Limited and/or its affiliates
+    SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #pragma once
@@ -9,6 +9,8 @@
 #include <cstdint>
 #include <initializer_list>
 
+namespace armral::utils {
+
 struct random_state;
 
 class linear_congruential_generator {
@@ -77,3 +79,5 @@ struct random_state {
   */
   static random_state from_seeds(std::initializer_list<uint64_t> seeds);
 };
+
+} // namespace armral::utils