From 03dcc80e97eee6f2a3dd378025fa98077340846e Mon Sep 17 00:00:00 2001 From: Nick Dingle Date: Tue, 16 Apr 2024 09:20:22 +0000 Subject: [PATCH 1/2] Release 24.04 Co-Authored-By: Rosie Sumpter Co-Authored-By: Ryo Suzuki --- .gitlab/merge_request_templates/Bugfix.md | 1 + .gitlab/merge_request_templates/Default.md | 1 + .../merge_request_templates/Documentation.md | 1 + CHANGELOG.md | 29 +- CMakeLists.txt | 971 +++++++++++------- CONTRIBUTING.md | 189 ++-- CREDITS.md | 14 +- Doxyfile.in | 2 +- LICENSE.md | 2 +- README.md | 78 +- RELEASE_NOTES.md | 179 ++-- .../Batch/GeneralMatInv/NonPA/bench.py | 2 +- .../Batch/GeneralMatInv/NonPA/main.cpp | 2 +- .../MatrixInv/Batch/GeneralMatInv/PA/bench.py | 2 +- .../MatrixInv/Batch/GeneralMatInv/PA/main.cpp | 2 +- .../Batch/HermitianMatInv/NonPA/bench.py | 2 +- .../Batch/HermitianMatInv/NonPA/main.cpp | 2 +- .../Batch/HermitianMatInv/PA/bench.py | 2 +- .../Batch/HermitianMatInv/PA/main.cpp | 2 +- .../MatrixInv/Single/GeneralMatInv/bench.py | 2 +- .../MatrixInv/Single/GeneralMatInv/main.cpp | 2 +- .../MatrixInv/Single/HermitianMatInv/bench.py | 2 +- .../MatrixInv/Single/HermitianMatInv/main.cpp | 2 +- .../MatrixMult/Batch/ArmSolve/1x2/bench.py | 2 +- .../MatrixMult/Batch/ArmSolve/1x2/main.cpp | 2 +- .../MatrixMult/Batch/ArmSolve/1x4/bench.py | 2 +- .../MatrixMult/Batch/ArmSolve/1x4/main.cpp | 2 +- .../MatrixMult/Batch/ArmSolve/2x2/bench.py | 2 +- .../MatrixMult/Batch/ArmSolve/2x2/main.cpp | 2 +- .../MatrixMult/Batch/ArmSolve/2x4/bench.py | 2 +- .../MatrixMult/Batch/ArmSolve/2x4/main.cpp | 2 +- .../MatrixMult/Batch/ArmSolve/4x4/bench.py | 2 +- .../MatrixMult/Batch/ArmSolve/4x4/main.cpp | 2 +- .../MatrixVectorMult16/32b/NonPA/bench.py | 2 +- .../MatrixVectorMult16/32b/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult16/32b/PA/bench.py | 2 +- .../Batch/MatrixVectorMult16/32b/PA/main.cpp | 2 +- .../MatrixVectorMult16/64b/NonPA/bench.py | 2 +- .../MatrixVectorMult16/64b/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult16/64b/PA/bench.py | 2 +- .../Batch/MatrixVectorMult16/64b/PA/main.cpp | 2 +- .../Batch/MatrixVectorMult32/NonPA/bench.py | 2 +- .../Batch/MatrixVectorMult32/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult32/PA/bench.py | 2 +- .../Batch/MatrixVectorMult32/PA/main.cpp | 2 +- .../Single/MatrixMult16/32b/bench.py | 2 +- .../Single/MatrixMult16/32b/main.cpp | 2 +- .../Single/MatrixMult16/64b/bench.py | 2 +- .../Single/MatrixMult16/64b/main.cpp | 2 +- .../Single/MatrixMult32/2x2/IQ/bench.py | 2 +- .../Single/MatrixMult32/2x2/IQ/main.cpp | 2 +- .../Single/MatrixMult32/2x2/NonIQ/bench.py | 2 +- .../Single/MatrixMult32/2x2/NonIQ/main.cpp | 2 +- .../Single/MatrixMult32/4x4/IQ/bench.py | 2 +- .../Single/MatrixMult32/4x4/IQ/main.cpp | 2 +- .../Single/MatrixMult32/4x4/NonIQ/bench.py | 2 +- .../Single/MatrixMult32/4x4/NonIQ/main.cpp | 2 +- .../Single/MatrixMult32/general/bench.py | 2 +- .../Single/MatrixMult32/general/main.cpp | 2 +- .../Single/MatrixMultAAH32/bench.py | 2 +- .../Single/MatrixMultAAH32/main.cpp | 2 +- .../Single/MatrixMultAHB32/bench.py | 2 +- .../Single/MatrixMultAHB32/main.cpp | 2 +- .../Single/MatrixVectorMult16/32bit/bench.py | 2 +- .../Single/MatrixVectorMult16/32bit/main.cpp | 2 +- .../Single/MatrixVectorMult16/64bit/bench.py | 2 +- .../Single/MatrixVectorMult16/64bit/main.cpp | 2 +- .../Single/MatrixVectorMult32/bench.py | 2 +- .../Single/MatrixVectorMult32/main.cpp | 2 +- .../MatrixPseudoInv/Direct/bench.py | 4 +- .../MatrixPseudoInv/Direct/main.cpp | 2 +- .../VectorDotProd/VecDot16/bench.py | 2 +- .../VectorDotProd/VecDot16/main.cpp | 
2 +- .../VectorDotProd/VecDot16_2/bench.py | 2 +- .../VectorDotProd/VecDot16_2/main.cpp | 2 +- .../VectorDotProd/VecDot16_2_32bit/bench.py | 2 +- .../VectorDotProd/VecDot16_2_32bit/main.cpp | 2 +- .../VectorDotProd/VecDot16_32bit/bench.py | 2 +- .../VectorDotProd/VecDot16_32bit/main.cpp | 2 +- .../VectorDotProd/VecDot32/bench.py | 2 +- .../VectorDotProd/VecDot32/main.cpp | 2 +- .../VectorDotProd/VecDot32_2/bench.py | 2 +- .../VectorDotProd/VecDot32_2/main.cpp | 14 +- .../VectorMult}/VecMul16/bench.py | 2 +- .../VectorMult}/VecMul16/main.cpp | 2 +- .../VectorMult}/VecMul16_2/bench.py | 2 +- .../VectorMult}/VecMul16_2/main.cpp | 2 +- .../VectorMult}/VecMul32/bench.py | 2 +- .../VectorMult}/VecMul32/main.cpp | 2 +- .../VectorMult}/VecMul32_2/bench.py | 2 +- .../VectorMult}/VecMul32_2/main.cpp | 14 +- .../MuLaw/Compression/14bit/bench.py | 2 +- .../MuLaw/Compression/14bit/main.cpp | 2 +- .../MuLaw/Compression/8bit/bench.py | 2 +- .../MuLaw/Compression/8bit/main.cpp | 2 +- .../MuLaw/Compression/9bit/bench.py | 2 +- .../MuLaw/Compression/9bit/main.cpp | 2 +- .../MuLaw/Decompression/14bit/bench.py | 2 +- .../MuLaw/Decompression/14bit/main.cpp | 2 +- .../MuLaw/Decompression/8bit/bench.py | 2 +- .../MuLaw/Decompression/8bit/main.cpp | 2 +- .../MuLaw/Decompression/9bit/bench.py | 2 +- .../MuLaw/Decompression/9bit/main.cpp | 2 +- .../Compression/12bit/bench.py | 2 +- .../Compression/12bit/main.cpp | 2 +- .../Compression/14bit/bench.py | 2 +- .../Compression/14bit/main.cpp | 2 +- .../ORanBlockFloat}/Compression/8bit/bench.py | 2 +- .../ORanBlockFloat}/Compression/8bit/main.cpp | 2 +- .../ORanBlockFloat}/Compression/9bit/bench.py | 2 +- .../ORanBlockFloat}/Compression/9bit/main.cpp | 2 +- .../Decompression/12bit/bench.py | 2 +- .../Decompression/12bit/main.cpp | 2 +- .../Decompression/14bit/bench.py | 2 +- .../Decompression/14bit/main.cpp | 2 +- .../Decompression/8bit/bench.py | 2 +- .../Decompression/8bit/main.cpp | 2 +- .../Decompression/9bit/bench.py | 2 +- .../Decompression/9bit/main.cpp | 2 +- .../Compression/14bit/bench.py | 2 +- .../Compression/14bit/main.cpp | 2 +- .../Compression/8bit/bench.py | 2 +- .../Compression/8bit/main.cpp | 2 +- .../Compression/9bit/bench.py | 2 +- .../Compression/9bit/main.cpp | 2 +- .../Decompression/14bit/bench.py | 2 +- .../Decompression/14bit/main.cpp | 2 +- .../Decompression/8bit/bench.py | 2 +- .../Decompression/8bit/main.cpp | 2 +- .../Decompression/9bit/bench.py | 2 +- .../Decompression/9bit/main.cpp | 2 +- bench/{ => LowerPHY}/Correlation/bench.py | 2 +- bench/{ => LowerPHY}/Correlation/main.cpp | 2 +- bench/{ => LowerPHY}/FFT/FFT16/bench.py | 2 +- bench/{ => LowerPHY}/FFT/FFT16/main.cpp | 2 +- bench/{ => LowerPHY}/FFT/FFT32/bench.py | 2 +- bench/{ => LowerPHY}/FFT/FFT32/main.cpp | 2 +- bench/{ => LowerPHY}/FIR/FIR16/bench.py | 2 +- bench/{ => LowerPHY}/FIR/FIR16/main.cpp | 2 +- .../FIR/FIR16Decimate2/bench.py | 2 +- .../FIR/FIR16Decimate2/main.cpp | 2 +- bench/{ => LowerPHY}/FIR/FIR32/bench.py | 2 +- bench/{ => LowerPHY}/FIR/FIR32/main.cpp | 2 +- .../FIR/FIR32Decimate2/bench.py | 2 +- .../FIR/FIR32Decimate2/main.cpp | 2 +- bench/{ => LowerPHY}/Scrambling/bench.py | 2 +- bench/{ => LowerPHY}/Scrambling/main.cpp | 2 +- bench/{ => LowerPHY}/SeqGenerator/bench.py | 2 +- bench/{ => LowerPHY}/SeqGenerator/main.cpp | 2 +- bench/{ => MatrixFactorizations}/SVD/bench.py | 2 +- bench/{ => MatrixFactorizations}/SVD/main.cpp | 6 +- .../{ => UpperPHY}/CRC/11/BigEndian/bench.py | 2 +- .../{ => UpperPHY}/CRC/11/BigEndian/main.cpp | 2 +- .../CRC/11/LittleEndian/bench.py | 2 
+- .../CRC/11/LittleEndian/main.cpp | 2 +- .../{ => UpperPHY}/CRC/16/BigEndian/bench.py | 2 +- .../{ => UpperPHY}/CRC/16/BigEndian/main.cpp | 2 +- .../CRC/16/LittleEndian/bench.py | 2 +- .../CRC/16/LittleEndian/main.cpp | 2 +- .../CRC/24/A/BigEndian/bench.py | 2 +- .../CRC/24/A/BigEndian/main.cpp | 2 +- .../CRC/24/A/LittleEndian/bench.py | 2 +- .../CRC/24/A/LittleEndian/main.cpp | 2 +- .../CRC/24/B/BigEndian/bench.py | 2 +- .../CRC/24/B/BigEndian/main.cpp | 2 +- .../CRC/24/B/LittleEndian/bench.py | 2 +- .../CRC/24/B/LittleEndian/main.cpp | 2 +- .../CRC/24/C/BigEndian/bench.py | 2 +- .../CRC/24/C/BigEndian/main.cpp | 2 +- .../CRC/24/C/LittleEndian/bench.py | 2 +- .../CRC/24/C/LittleEndian/main.cpp | 2 +- bench/{ => UpperPHY}/CRC/6/BigEndian/bench.py | 2 +- bench/{ => UpperPHY}/CRC/6/BigEndian/main.cpp | 2 +- .../CRC/6/LittleEndian/bench.py | 2 +- .../CRC/6/LittleEndian/main.cpp | 2 +- .../ConvolutionalDecoder}/bench.py | 2 +- .../ConvolutionalDecoder}/main.cpp | 2 +- .../ConvolutionalEncoder}/bench.py | 2 +- .../ConvolutionalEncoder}/main.cpp | 2 +- bench/{ => UpperPHY}/Demodulation/bench.py | 2 +- bench/{ => UpperPHY}/Demodulation/main.cpp | 2 +- bench/{ => UpperPHY}/LDPC/Decoding/bench.py | 2 +- bench/{ => UpperPHY}/LDPC/Decoding/main.cpp | 2 +- bench/{ => UpperPHY}/LDPC/Encoding/bench.py | 2 +- bench/{ => UpperPHY}/LDPC/Encoding/main.cpp | 2 +- .../{ => UpperPHY}/LDPC/RateMatching/bench.py | 2 +- .../{ => UpperPHY}/LDPC/RateMatching/main.cpp | 2 +- .../{ => UpperPHY}/LDPC/RateRecovery/bench.py | 2 +- .../{ => UpperPHY}/LDPC/RateRecovery/main.cpp | 2 +- bench/{ => UpperPHY}/Modulation/bench.py | 2 +- bench/{ => UpperPHY}/Modulation/main.cpp | 2 +- bench/{ => UpperPHY}/Polar/Decoding/bench.py | 2 +- bench/{ => UpperPHY}/Polar/Decoding/main.cpp | 6 +- bench/{ => UpperPHY}/Polar/Encoding/bench.py | 2 +- bench/{ => UpperPHY}/Polar/Encoding/main.cpp | 2 +- bench/{ => UpperPHY}/Polar/Frozen/bench.py | 2 +- bench/{ => UpperPHY}/Polar/Frozen/main.cpp | 2 +- .../Polar/RateMatching/bench.py | 2 +- .../Polar/RateMatching/main.cpp | 2 +- .../Polar/RateRecovery/bench.py | 2 +- .../Polar/RateRecovery/main.cpp | 2 +- .../Polar/SubchannelDeinterleave/bench.py | 2 +- .../Polar/SubchannelDeinterleave/main.cpp | 2 +- .../Polar/SubchannelInterleave/bench.py | 2 +- .../Polar/SubchannelInterleave/main.cpp | 2 +- bench/{ => UpperPHY}/Turbo/Decoding/bench.py | 2 +- bench/{ => UpperPHY}/Turbo/Decoding/main.cpp | 6 +- bench/{ => UpperPHY}/Turbo/Encoding/bench.py | 2 +- bench/{ => UpperPHY}/Turbo/Encoding/main.cpp | 2 +- .../Turbo/RateMatching/bench.py | 2 +- .../Turbo/RateMatching/main.cpp | 2 +- .../Turbo/RateRecovery/bench.py | 2 +- .../Turbo/RateRecovery/main.cpp | 2 +- bench/benchmarker.py | 4 +- bench/benchmarker_utils.py | 10 +- bench/default_runner.py | 2 +- docs/doxywrapper/arm_infra_html.css | 2 +- docs/examples.md | 23 +- docs/frontmatter.md | 47 +- examples/block_float_9b_example.c | 2 +- examples/fft_cf32_example.c | 4 +- examples/modulation_example.c | 2 +- examples/polar_example.cpp | 2 +- include/armral.h | 44 +- python/benchmark_excel_summary.py | 133 +++ python/requirements.txt | 2 + simulation/CMakeLists.txt | 147 ++- simulation/README.md | 51 +- simulation/awgn/CMakeLists.txt | 16 +- simulation/awgn/awgn.cpp | 21 +- simulation/awgn/{awgn.h => awgn.hpp} | 11 +- simulation/capacity/capacity.py | 2 +- simulation/convolutional_awgn/CMakeLists.txt | 35 - .../convolutional_awgn/convolutional_awgn.cpp | 42 +- .../convolutional_error_rate.py | 2 +- simulation/include/simulation_common.hpp | 6 +- 
simulation/include/simulation_common.py | 2 +- simulation/ldpc_awgn/CMakeLists.txt | 35 - simulation/ldpc_awgn/ldpc_awgn.cpp | 38 +- simulation/ldpc_awgn/ldpc_error_rate.py | 2 +- simulation/modulation_awgn/CMakeLists.txt | 35 - .../modulation_awgn/modulation_awgn.cpp | 34 +- .../modulation_awgn/modulation_error_rate.py | 2 +- simulation/polar_awgn/CMakeLists.txt | 35 - simulation/polar_awgn/polar_awgn.cpp | 35 +- simulation/polar_awgn/polar_error_rate.py | 2 +- simulation/turbo_awgn/CMakeLists.txt | 35 - simulation/turbo_awgn/turbo_awgn.cpp | 32 +- simulation/turbo_awgn/turbo_error_rate.py | 2 +- .../arm_cmplx_hermitian_mat_inversion_f32.cpp | 142 +-- .../MatrixInv/arm_cmplx_mat_inversion_f32.cpp | 4 +- .../cmplx_hermitian_mat_inversion_f32.hpp | 2 +- .../MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp | 2 +- .../MatrixMult/arm_cmplx_mat_mult_ahb_f32.c | 111 +- .../MatrixMult/arm_cmplx_mat_mult_f32.c | 24 +- .../MatrixMult/arm_cmplx_mat_mult_i16.c | 2 +- .../MatrixMult/arm_cmplx_mat_mult_i16_32bit.c | 2 +- .../MatrixMult/arm_cmplx_mat_vec_mult_f32.c | 96 +- .../MatrixMult/arm_cmplx_mat_vec_mult_i16.c | 14 +- .../arm_cmplx_mat_vec_mult_i16_32bit.c | 18 +- src/BasicMathFun/MatrixMult/arm_solve_1sc.c | 27 +- src/BasicMathFun/MatrixMult/arm_solve_1sc.h | 2 +- src/BasicMathFun/MatrixMult/arm_solve_4sc.c | 2 +- src/BasicMathFun/MatrixMult/arm_solve_4sc.h | 2 +- src/BasicMathFun/MatrixMult/arm_solve_6sc.c | 4 +- src/BasicMathFun/MatrixMult/arm_solve_6sc.h | 2 +- .../MatrixMult/arm_solve_convert.h | 14 +- src/BasicMathFun/MatrixMult/arm_solve_f32.c | 2 +- .../arm_cmplx_pseudo_inverse_direct_f32.cpp | 30 +- .../cmplx_mat_pseudo_inverse.hpp | 2 +- .../VectorDotProd/arm_cmplx_vecdot_f32.c | 14 +- .../VectorDotProd/arm_cmplx_vecdot_f32_2.c | 13 +- .../VectorDotProd/arm_cmplx_vecdot_i16.c | 2 +- .../VectorDotProd/arm_cmplx_vecdot_i16_2.c | 2 +- .../arm_cmplx_vecdot_i16_2_32bit.c | 2 +- .../arm_cmplx_vecdot_i16_32bit.c | 2 +- .../VectorMult/arm_cmplx_vecmul_f32.c | 28 +- .../VectorMult/arm_cmplx_vecmul_f32_2.c | 21 +- .../VectorMult/arm_cmplx_vecmul_i16.cpp | 2 +- .../VectorMult/arm_cmplx_vecmul_i16_2.c | 2 +- .../arm_mu_law_compression.cpp | 2 +- .../arm_mu_law_decompression.cpp | 141 +-- .../arm_block_float_compression.cpp | 2 +- .../arm_block_float_decompression.cpp | 57 +- .../arm_block_scaling_compression.cpp | 2 +- .../arm_block_scaling_decompression.cpp | 60 +- src/DuRuInterface/bit_packing_common.hpp | 2 +- src/DuRuInterface/bit_unpacking_common.hpp | 185 ++++ src/LowerPHY/Correlation/arm_correlation.c | 2 +- src/LowerPHY/FFT/fft_cf32.cpp | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_kernel_lookup.c | 2 +- src/LowerPHY/FFT/fft_cf32_kernel_lookup.h | 2 +- src/LowerPHY/FFT/fft_cs16.cpp | 2 +- src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c | 2 +- 
src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c | 2 +- src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cs16_kernel_lookup.c | 2 +- src/LowerPHY/FFT/fft_cs16_kernel_lookup.h | 2 +- src/LowerPHY/FFT/fft_execute.cpp | 2 +- src/LowerPHY/FFT/fft_execute.hpp | 2 +- src/LowerPHY/FFT/fft_helper.h | 2 +- src/LowerPHY/FFT/fft_level.cpp | 2 +- src/LowerPHY/FFT/fft_level.hpp | 2 +- src/LowerPHY/FFT/fft_plan.cpp | 4 +- src/LowerPHY/FFT/fft_plan.hpp | 2 +- src/LowerPHY/FFT/fft_types.hpp | 2 +- src/LowerPHY/FFT/rader.cpp | 2 +- src/LowerPHY/FFT/rader.hpp | 2 +- src/LowerPHY/FFT/rader_generator.cpp | 2 +- src/LowerPHY/FFT/rader_generator.hpp | 2 +- src/LowerPHY/FIR/arm_fir_filter_cf32.c | 25 +- .../FIR/arm_fir_filter_cf32_decimate_2.c | 40 +- src/LowerPHY/FIR/arm_fir_filter_cs16.c | 2 +- .../FIR/arm_fir_filter_cs16_decimate_2.c | 4 +- src/LowerPHY/Scrambling/arm_scrambling.cpp | 2 +- .../SeqGenerator/arm_mat_seq_generator.cpp | 2 +- .../SVD/arm_svd.cpp | 134 +-- .../SVD/matrix_view.hpp | 2 +- src/UpperPHY/CRC/arm_crc11.cpp | 2 +- src/UpperPHY/CRC/arm_crc16.cpp | 2 +- src/UpperPHY/CRC/arm_crc24_a.cpp | 2 +- src/UpperPHY/CRC/arm_crc24_b.cpp | 2 +- src/UpperPHY/CRC/arm_crc24_c.cpp | 2 +- src/UpperPHY/CRC/arm_crc6.cpp | 2 +- src/UpperPHY/CRC/crc_basic.hpp | 2 +- src/UpperPHY/CRC/crc_common.hpp | 8 +- .../arm_convolutional_decoder.cpp | 4 +- .../arm_convolutional_encoder.cpp | 2 +- .../convolutional_code_table.hpp | 2 +- src/UpperPHY/Demodulation/arm_demodulation.c | 4 +- src/UpperPHY/LDPC/ldpc_coding.hpp | 2 +- src/UpperPHY/LDPC/ldpc_decoder.cpp | 4 +- src/UpperPHY/LDPC/ldpc_encoder.cpp | 606 ++++++++++- src/UpperPHY/LDPC/ldpc_rate_common.hpp | 2 +- src/UpperPHY/LDPC/ldpc_rate_matching.cpp | 2 +- src/UpperPHY/LDPC/ldpc_rate_recovery.cpp | 2 +- src/UpperPHY/Modulation/arm_modulation.c | 2 +- .../Polar/arm_polar_crc_attachment.cpp | 2 +- src/UpperPHY/Polar/arm_polar_crc_check.cpp | 2 +- src/UpperPHY/Polar/arm_polar_decoder.cpp | 4 +- src/UpperPHY/Polar/arm_polar_decoder.hpp | 2 +- src/UpperPHY/Polar/arm_polar_decoder_neon.hpp | 8 +- src/UpperPHY/Polar/arm_polar_encoder.c | 2 +- src/UpperPHY/Polar/arm_polar_frozen_bits.cpp | 4 +- .../Polar/arm_polar_rate_matching.cpp | 2 +- .../Polar/arm_polar_rate_recovery.cpp | 2 +- .../arm_polar_subchannel_deinterleave.cpp | 2 +- .../Polar/arm_polar_subchannel_interleave.cpp | 2 +- src/UpperPHY/Turbo/arm_turbo_decoder.cpp | 549 +--------- src/UpperPHY/Turbo/arm_turbo_encoder.cpp | 2 +- .../Turbo/arm_turbo_rate_matching.cpp | 2 +- .../Turbo/arm_turbo_rate_recovery.cpp | 2 +- src/UpperPHY/Turbo/turbo_code.hpp | 2 +- src/UpperPHY/Turbo/turbo_decoder_fp16.hpp | 520 ++++++++++ src/UpperPHY/Turbo/turbo_decoder_fp32.hpp | 533 ++++++++++ src/UpperPHY/Turbo/turbo_tables.hpp | 2 +- src/intrinsics.h | 2 +- src/utils/allocators.hpp | 2 +- src/utils/cmplx_arith_f32.hpp | 2 +- src/utils/vec_mul.hpp | 2 +- .../MatrixInv/Batch}/main.cpp | 43 +- .../MatrixInv/Single}/main.cpp | 22 +- .../MatrixMult/Batch}/ArmSolve/main.cpp | 6 +- .../Batch}/MatrixVectorMult16/main.cpp | 2 +- .../Batch}/MatrixVectorMult32/main.cpp | 2 +- .../MatrixMult/Single}/MatrixMult16/main.cpp | 2 +- .../MatrixMult/Single}/MatrixMult32/main.cpp | 10 +- .../Single}/MatrixMultAAH32/main.cpp | 2 +- .../Single}/MatrixMultAHB32/main.cpp | 2 +- .../Single}/MatrixVectorMult16/main.cpp | 2 +- .../Single}/MatrixVectorMult32/main.cpp | 2 +- .../MatrixPseudoInv/Direct}/main.cpp | 31 +- .../VectorDotProd/VecDot16}/main.cpp | 2 +- 
.../VectorDotProd/VecDot16_2}/main.cpp | 2 +- .../VectorDotProd/VecDot16_2_32bit}/main.cpp | 2 +- .../VectorDotProd/VecDot16_32bit}/main.cpp | 2 +- .../VectorDotProd/VecDot32}/main.cpp | 4 +- .../VectorDotProd/VecDot32_2}/main.cpp | 4 +- .../VectorMult/VecMul16}/main.cpp | 2 +- .../VectorMult/VecMul16_2}/main.cpp | 2 +- .../VectorMult/VecMul32}/main.cpp | 2 +- .../VectorMult/VecMul32_2}/main.cpp | 2 +- .../MuLaw/Compression/main.cpp | 2 +- .../MuLaw/Decompression/main.cpp | 2 +- .../ORanBlockFloat}/Compression/main.cpp | 2 +- .../ORanBlockFloat}/Decompression/main.cpp | 2 +- .../ORanBlockScaling/Compression/main.cpp | 2 +- .../ORanBlockScaling/Decompression/main.cpp | 2 +- test/{ => LowerPHY}/Correlation/main.cpp | 2 +- .../{FFT/cs16 => LowerPHY/FFT/FFT16}/main.cpp | 14 +- .../{FFT/cf32 => LowerPHY/FFT/FFT32}/main.cpp | 6 +- .../FIR/FIR16}/main.cpp | 2 +- .../FIR/FIR16Decimate2}/main.cpp | 2 +- .../FIR/FIR32}/main.cpp | 2 +- .../FIR/FIR32Decimate2}/main.cpp | 2 +- test/{ => LowerPHY}/Scrambling/main.cpp | 2 +- test/{ => LowerPHY}/SeqGenerator/main.cpp | 2 +- test/{ => MatrixFactorizations}/SVD/main.cpp | 22 +- .../SVD/svd_sample_data.h | 6 +- .../SVD/svd_test.hpp | 208 ++-- test/{ => UpperPHY}/CRC/main.cpp | 2 +- .../ConvolutionalDecoder}/main.cpp | 2 +- .../ConvolutionalEncoder}/main.cpp | 2 +- test/{ => UpperPHY}/Demodulation/main.cpp | 6 +- .../LDPC/Decoding}/main.cpp | 2 +- .../LDPC/Encoding}/ldpc_encoding_test_data.h | 2 +- .../LDPC/Encoding}/main.cpp | 4 +- .../LDPC/RateMatching}/main.cpp | 2 +- .../LDPC/RateRecovery}/main.cpp | 3 +- test/{ => UpperPHY}/LDPC/ldpc_test_common.hpp | 2 +- test/{ => UpperPHY}/Modulation/main.cpp | 2 +- .../Polar/CrcAttachment}/main.cpp | 2 +- .../CrcAttachment}/polar_crc_attach_data.hpp | 2 +- .../Polar/Decoding}/main.cpp | 2 +- .../Polar/Encoding}/main.cpp | 2 +- .../frozen => UpperPHY/Polar/Frozen}/main.cpp | 2 +- .../Polar/RateMatching}/main.cpp | 2 +- .../Polar/RateRecovery}/main.cpp | 2 +- .../Polar/SubchannelDeinterleave}/main.cpp | 2 +- .../Polar/SubchannelInterleave}/main.cpp | 2 +- .../Turbo/Decoding}/main.cpp | 4 +- .../Turbo/Encoding}/main.cpp | 4 +- .../Encoding}/reference_turbo_encoder.hpp | 2 +- .../Turbo/RateMatching}/main.cpp | 2 +- .../Turbo/RateRecovery}/main.cpp | 2 +- .../RateRecovery}/rate_recovery_data.hpp | 2 +- test/{ => UpperPHY}/Turbo/turbo_test_data.hpp | 6 +- utils/bit_utils.hpp | 4 +- utils/cf32_utils.hpp | 61 +- utils/cs16_utils.hpp | 2 +- utils/fft_utils.hpp | 2 +- utils/int8_utils.hpp | 2 +- utils/matrix_utils.hpp | 63 +- utils/qint64.hpp | 2 +- utils/reference_linalg.hpp | 8 +- utils/rng.cpp | 12 +- utils/rng.hpp | 6 +- 452 files changed, 4314 insertions(+), 2793 deletions(-) rename bench/{ => BasicMathFun}/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/GeneralMatInv/PA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/GeneralMatInv/PA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp (93%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/HermitianMatInv/PA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixInv/Batch/HermitianMatInv/PA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixInv/Single/GeneralMatInv/bench.py (82%) rename bench/{ => BasicMathFun}/MatrixInv/Single/GeneralMatInv/main.cpp (90%) rename bench/{ => 
BasicMathFun}/MatrixInv/Single/HermitianMatInv/bench.py (82%) rename bench/{ => BasicMathFun}/MatrixInv/Single/HermitianMatInv/main.cpp (91%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/1x2/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/1x2/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/1x4/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/1x4/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/2x2/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/2x2/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/2x4/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/2x4/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/4x4/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/ArmSolve/4x4/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp (95%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp (95%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp (94%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py (86%) rename bench/{ => BasicMathFun}/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp (95%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult16/32b/bench.py (82%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult16/32b/main.cpp (91%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult16/64b/bench.py (82%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult16/64b/main.cpp (90%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py (79%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp (91%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py (79%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp (89%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py (79%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp (92%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py (79%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp (89%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/general/bench.py (87%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMult32/general/main.cpp (93%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMultAAH32/bench.py (85%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMultAAH32/main.cpp (91%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMultAHB32/bench.py (91%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixMultAHB32/main.cpp (92%) rename bench/{ => 
BasicMathFun}/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp (92%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py (84%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp (92%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixVectorMult32/bench.py (87%) rename bench/{ => BasicMathFun}/MatrixMult/Single/MatrixVectorMult32/main.cpp (89%) rename bench/{ => BasicMathFun}/MatrixPseudoInv/Direct/bench.py (81%) rename bench/{ => BasicMathFun}/MatrixPseudoInv/Direct/main.cpp (94%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16/bench.py (82%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16/main.cpp (90%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_2/bench.py (83%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_2/main.cpp (92%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_2_32bit/bench.py (83%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_2_32bit/main.cpp (92%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_32bit/bench.py (83%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot16_32bit/main.cpp (90%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot32/bench.py (82%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot32/main.cpp (90%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot32_2/bench.py (82%) rename bench/{ => BasicMathFun}/VectorDotProd/VecDot32_2/main.cpp (75%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul16/bench.py (82%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul16/main.cpp (91%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul16_2/bench.py (82%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul16_2/main.cpp (92%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul32/bench.py (82%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul32/main.cpp (91%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul32_2/bench.py (82%) rename bench/{ElemWiseVectorMult => BasicMathFun/VectorMult}/VecMul32_2/main.cpp (74%) rename bench/{ => DuRuInterface}/MuLaw/Compression/14bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Compression/14bit/main.cpp (93%) rename bench/{ => DuRuInterface}/MuLaw/Compression/8bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Compression/8bit/main.cpp (93%) rename bench/{ => DuRuInterface}/MuLaw/Compression/9bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Compression/9bit/main.cpp (93%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/14bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/14bit/main.cpp (93%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/8bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/8bit/main.cpp (93%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/9bit/bench.py (87%) rename bench/{ => DuRuInterface}/MuLaw/Decompression/9bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/12bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/12bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/14bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/14bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/8bit/bench.py 
(87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/8bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/9bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/9bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/12bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/12bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/14bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/14bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/8bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/8bit/main.cpp (93%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/9bit/bench.py (87%) rename bench/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/9bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/14bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/14bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/8bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/8bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/9bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Compression/9bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/14bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/14bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/8bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/8bit/main.cpp (93%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/9bit/bench.py (87%) rename bench/{ => DuRuInterface}/ORanBlockScaling/Decompression/9bit/main.cpp (93%) rename bench/{ => LowerPHY}/Correlation/bench.py (82%) rename bench/{ => LowerPHY}/Correlation/main.cpp (89%) rename bench/{ => LowerPHY}/FFT/FFT16/bench.py (87%) rename bench/{ => LowerPHY}/FFT/FFT16/main.cpp (92%) rename bench/{ => LowerPHY}/FFT/FFT32/bench.py (87%) rename bench/{ => LowerPHY}/FFT/FFT32/main.cpp (92%) rename bench/{ => LowerPHY}/FIR/FIR16/bench.py (83%) rename bench/{ => LowerPHY}/FIR/FIR16/main.cpp (93%) rename bench/{ => LowerPHY}/FIR/FIR16Decimate2/bench.py (84%) rename bench/{ => LowerPHY}/FIR/FIR16Decimate2/main.cpp (93%) rename bench/{ => LowerPHY}/FIR/FIR32/bench.py (83%) rename bench/{ => LowerPHY}/FIR/FIR32/main.cpp (93%) rename bench/{ => LowerPHY}/FIR/FIR32Decimate2/bench.py (84%) rename bench/{ => LowerPHY}/FIR/FIR32Decimate2/main.cpp (93%) rename bench/{ => LowerPHY}/Scrambling/bench.py (84%) rename bench/{ => LowerPHY}/Scrambling/main.cpp (89%) rename bench/{ => LowerPHY}/SeqGenerator/bench.py (84%) rename bench/{ => LowerPHY}/SeqGenerator/main.cpp (86%) rename bench/{ => MatrixFactorizations}/SVD/bench.py (86%) rename bench/{ => MatrixFactorizations}/SVD/main.cpp (89%) rename bench/{ => UpperPHY}/CRC/11/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/11/BigEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/11/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/11/LittleEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/16/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/16/BigEndian/main.cpp (91%) rename 
bench/{ => UpperPHY}/CRC/16/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/16/LittleEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/A/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/A/BigEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/A/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/A/LittleEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/B/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/B/BigEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/B/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/B/LittleEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/C/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/C/BigEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/24/C/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/24/C/LittleEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/6/BigEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/6/BigEndian/main.cpp (91%) rename bench/{ => UpperPHY}/CRC/6/LittleEndian/bench.py (85%) rename bench/{ => UpperPHY}/CRC/6/LittleEndian/main.cpp (91%) rename bench/{ConvCoding/Decoding => UpperPHY/ConvolutionalDecoder}/bench.py (85%) rename bench/{ConvCoding/Decoding => UpperPHY/ConvolutionalDecoder}/main.cpp (94%) rename bench/{ConvCoding/Encoding => UpperPHY/ConvolutionalEncoder}/bench.py (85%) rename bench/{ConvCoding/Encoding => UpperPHY/ConvolutionalEncoder}/main.cpp (92%) rename bench/{ => UpperPHY}/Demodulation/bench.py (85%) rename bench/{ => UpperPHY}/Demodulation/main.cpp (95%) rename bench/{ => UpperPHY}/LDPC/Decoding/bench.py (92%) rename bench/{ => UpperPHY}/LDPC/Decoding/main.cpp (95%) rename bench/{ => UpperPHY}/LDPC/Encoding/bench.py (90%) rename bench/{ => UpperPHY}/LDPC/Encoding/main.cpp (95%) rename bench/{ => UpperPHY}/LDPC/RateMatching/bench.py (90%) rename bench/{ => UpperPHY}/LDPC/RateMatching/main.cpp (96%) rename bench/{ => UpperPHY}/LDPC/RateRecovery/bench.py (90%) rename bench/{ => UpperPHY}/LDPC/RateRecovery/main.cpp (95%) rename bench/{ => UpperPHY}/Modulation/bench.py (86%) rename bench/{ => UpperPHY}/Modulation/main.cpp (95%) rename bench/{ => UpperPHY}/Polar/Decoding/bench.py (88%) rename bench/{ => UpperPHY}/Polar/Decoding/main.cpp (89%) rename bench/{ => UpperPHY}/Polar/Encoding/bench.py (84%) rename bench/{ => UpperPHY}/Polar/Encoding/main.cpp (89%) rename bench/{ => UpperPHY}/Polar/Frozen/bench.py (93%) rename bench/{ => UpperPHY}/Polar/Frozen/main.cpp (93%) rename bench/{ => UpperPHY}/Polar/RateMatching/bench.py (91%) rename bench/{ => UpperPHY}/Polar/RateMatching/main.cpp (94%) rename bench/{ => UpperPHY}/Polar/RateRecovery/bench.py (91%) rename bench/{ => UpperPHY}/Polar/RateRecovery/main.cpp (94%) rename bench/{ => UpperPHY}/Polar/SubchannelDeinterleave/bench.py (88%) rename bench/{ => UpperPHY}/Polar/SubchannelDeinterleave/main.cpp (93%) rename bench/{ => UpperPHY}/Polar/SubchannelInterleave/bench.py (88%) rename bench/{ => UpperPHY}/Polar/SubchannelInterleave/main.cpp (93%) rename bench/{ => UpperPHY}/Turbo/Decoding/bench.py (84%) rename bench/{ => UpperPHY}/Turbo/Decoding/main.cpp (94%) rename bench/{ => UpperPHY}/Turbo/Encoding/bench.py (84%) rename bench/{ => UpperPHY}/Turbo/Encoding/main.cpp (95%) rename bench/{ => UpperPHY}/Turbo/RateMatching/bench.py (86%) rename bench/{ => UpperPHY}/Turbo/RateMatching/main.cpp (95%) rename bench/{ => UpperPHY}/Turbo/RateRecovery/bench.py (86%) rename bench/{ => UpperPHY}/Turbo/RateRecovery/main.cpp (95%) create mode 100755 
python/benchmark_excel_summary.py create mode 100644 python/requirements.txt rename simulation/awgn/{awgn.h => awgn.hpp} (82%) delete mode 100644 simulation/convolutional_awgn/CMakeLists.txt delete mode 100644 simulation/ldpc_awgn/CMakeLists.txt delete mode 100644 simulation/modulation_awgn/CMakeLists.txt delete mode 100644 simulation/polar_awgn/CMakeLists.txt delete mode 100644 simulation/turbo_awgn/CMakeLists.txt create mode 100644 src/DuRuInterface/bit_unpacking_common.hpp rename src/{ => MatrixFactorizations}/SVD/arm_svd.cpp (90%) rename src/{ => MatrixFactorizations}/SVD/matrix_view.hpp (83%) create mode 100644 src/UpperPHY/Turbo/turbo_decoder_fp16.hpp create mode 100644 src/UpperPHY/Turbo/turbo_decoder_fp32.hpp rename test/{MatrixInv/batch => BasicMathFun/MatrixInv/Batch}/main.cpp (87%) rename test/{MatrixInv/single => BasicMathFun/MatrixInv/Single}/main.cpp (83%) rename test/{MatrixMult/batch => BasicMathFun/MatrixMult/Batch}/ArmSolve/main.cpp (97%) rename test/{MatrixMult/batch => BasicMathFun/MatrixMult/Batch}/MatrixVectorMult16/main.cpp (98%) rename test/{MatrixMult/batch => BasicMathFun/MatrixMult/Batch}/MatrixVectorMult32/main.cpp (97%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixMult16/main.cpp (94%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixMult32/main.cpp (94%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixMultAAH32/main.cpp (88%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixMultAHB32/main.cpp (93%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixVectorMult16/main.cpp (94%) rename test/{MatrixMult/single => BasicMathFun/MatrixMult/Single}/MatrixVectorMult32/main.cpp (89%) rename test/{MatrixPseudoInv/direct => BasicMathFun/MatrixPseudoInv/Direct}/main.cpp (82%) rename test/{VectorDotProd/vecDot16 => BasicMathFun/VectorDotProd/VecDot16}/main.cpp (91%) rename test/{VectorDotProd/vecDot16_2 => BasicMathFun/VectorDotProd/VecDot16_2}/main.cpp (92%) rename test/{VectorDotProd/vecDot16_2_32bit => BasicMathFun/VectorDotProd/VecDot16_2_32bit}/main.cpp (93%) rename test/{VectorDotProd/vecDot16_32bit => BasicMathFun/VectorDotProd/VecDot16_32bit}/main.cpp (92%) rename test/{VectorDotProd/vecDot32 => BasicMathFun/VectorDotProd/VecDot32}/main.cpp (82%) rename test/{VectorDotProd/vecDot32_2 => BasicMathFun/VectorDotProd/VecDot32_2}/main.cpp (87%) rename test/{ElemWiseVectorMult/vecMul16 => BasicMathFun/VectorMult/VecMul16}/main.cpp (97%) rename test/{ElemWiseVectorMult/vecMul16_2 => BasicMathFun/VectorMult/VecMul16_2}/main.cpp (97%) rename test/{ElemWiseVectorMult/vecMul32 => BasicMathFun/VectorMult/VecMul32}/main.cpp (89%) rename test/{ElemWiseVectorMult/vecMul32_2 => BasicMathFun/VectorMult/VecMul32_2}/main.cpp (92%) rename test/{ => DuRuInterface}/MuLaw/Compression/main.cpp (99%) rename test/{ => DuRuInterface}/MuLaw/Decompression/main.cpp (98%) rename test/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Compression/main.cpp (99%) rename test/{XRanBlockFloat => DuRuInterface/ORanBlockFloat}/Decompression/main.cpp (98%) rename test/{ => DuRuInterface}/ORanBlockScaling/Compression/main.cpp (99%) rename test/{ => DuRuInterface}/ORanBlockScaling/Decompression/main.cpp (98%) rename test/{ => LowerPHY}/Correlation/main.cpp (97%) rename test/{FFT/cs16 => LowerPHY/FFT/FFT16}/main.cpp (86%) rename test/{FFT/cf32 => LowerPHY/FFT/FFT32}/main.cpp (92%) rename test/{FIR/arm_fir_filter_cs16 => LowerPHY/FIR/FIR16}/main.cpp (94%) rename 
test/{FIR/arm_fir_filter_cs16_decimate_2 => LowerPHY/FIR/FIR16Decimate2}/main.cpp (95%) rename test/{FIR/arm_fir_filter_cf32 => LowerPHY/FIR/FIR32}/main.cpp (94%) rename test/{FIR/arm_fir_filter_cf32_decimate_2 => LowerPHY/FIR/FIR32Decimate2}/main.cpp (94%) rename test/{ => LowerPHY}/Scrambling/main.cpp (93%) rename test/{ => LowerPHY}/SeqGenerator/main.cpp (95%) rename test/{ => MatrixFactorizations}/SVD/main.cpp (89%) rename test/{ => MatrixFactorizations}/SVD/svd_sample_data.h (93%) rename test/{ => MatrixFactorizations}/SVD/svd_test.hpp (85%) rename test/{ => UpperPHY}/CRC/main.cpp (97%) rename test/{ConvCoding/decoding => UpperPHY/ConvolutionalDecoder}/main.cpp (97%) rename test/{ConvCoding/encoding => UpperPHY/ConvolutionalEncoder}/main.cpp (96%) rename test/{ => UpperPHY}/Demodulation/main.cpp (97%) rename test/{LDPC/decoding => UpperPHY/LDPC/Decoding}/main.cpp (98%) rename test/{LDPC/encoding => UpperPHY/LDPC/Encoding}/ldpc_encoding_test_data.h (98%) rename test/{LDPC/encoding => UpperPHY/LDPC/Encoding}/main.cpp (98%) rename test/{LDPC/rate_matching => UpperPHY/LDPC/RateMatching}/main.cpp (99%) rename test/{LDPC/rate_recovery => UpperPHY/LDPC/RateRecovery}/main.cpp (99%) rename test/{ => UpperPHY}/LDPC/ldpc_test_common.hpp (95%) rename test/{ => UpperPHY}/Modulation/main.cpp (99%) rename test/{Polar/crc_attachment => UpperPHY/Polar/CrcAttachment}/main.cpp (93%) rename test/{Polar/crc_attachment => UpperPHY/Polar/CrcAttachment}/polar_crc_attach_data.hpp (85%) rename test/{Polar/decoding => UpperPHY/Polar/Decoding}/main.cpp (97%) rename test/{Polar/encoding => UpperPHY/Polar/Encoding}/main.cpp (92%) rename test/{Polar/frozen => UpperPHY/Polar/Frozen}/main.cpp (98%) rename test/{Polar/rate_matching => UpperPHY/Polar/RateMatching}/main.cpp (98%) rename test/{Polar/rate_recovery => UpperPHY/Polar/RateRecovery}/main.cpp (98%) rename test/{Polar/subchannel_deinterleave => UpperPHY/Polar/SubchannelDeinterleave}/main.cpp (92%) rename test/{Polar/subchannel_interleave => UpperPHY/Polar/SubchannelInterleave}/main.cpp (95%) rename test/{Turbo/decoding => UpperPHY/Turbo/Decoding}/main.cpp (96%) rename test/{Turbo/encoding => UpperPHY/Turbo/Encoding}/main.cpp (95%) rename test/{Turbo/encoding => UpperPHY/Turbo/Encoding}/reference_turbo_encoder.hpp (99%) rename test/{Turbo/rate_matching => UpperPHY/Turbo/RateMatching}/main.cpp (99%) rename test/{Turbo/rate_recovery => UpperPHY/Turbo/RateRecovery}/main.cpp (94%) rename test/{Turbo/rate_recovery => UpperPHY/Turbo/RateRecovery}/rate_recovery_data.hpp (99%) rename test/{ => UpperPHY}/Turbo/turbo_test_data.hpp (89%) diff --git a/.gitlab/merge_request_templates/Bugfix.md b/.gitlab/merge_request_templates/Bugfix.md index e7bce9b..9af681e 100644 --- a/.gitlab/merge_request_templates/Bugfix.md +++ b/.gitlab/merge_request_templates/Bugfix.md @@ -11,6 +11,7 @@ If an [Issue](https://gitlab.arm.com/networking/ral/-/issues) already exists for * [] ["Unreleased" section of the Changelog updated](https://gitlab.arm.com/networking/ral/-/blob/main/CHANGELOG.md#unreleased) * [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style) * [] [`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style) +* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style) * [] Commit message 
includes information on how to reproduce the issue(s)
* [] [Tests added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-tests)
* [] [Tests pass when run with AddressSanitizer](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md?ref_type=heads#user-content-testing-with-addresssanitizer)
diff --git a/.gitlab/merge_request_templates/Default.md b/.gitlab/merge_request_templates/Default.md
index 7e12edd..ced68b5 100644
--- a/.gitlab/merge_request_templates/Default.md
+++ b/.gitlab/merge_request_templates/Default.md
@@ -15,6 +15,7 @@ If this Merge Request addresses an [Issue](https://gitlab.arm.com/networking/ral
* [] ["Unreleased" section of the Changelog updated](https://gitlab.arm.com/networking/ral/-/blob/main/CHANGELOG.md#unreleased)
* [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style)
* [] [`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style)
+* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style)
* [] [Tests added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-tests)
* [] [Tests pass when run with AddressSanitizer](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md?ref_type=heads#user-content-testing-with-addresssanitizer)
* [] [Benchmarks added or updated](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-writing-benchmarks)
diff --git a/.gitlab/merge_request_templates/Documentation.md b/.gitlab/merge_request_templates/Documentation.md
index ab24853..8803932 100644
--- a/.gitlab/merge_request_templates/Documentation.md
+++ b/.gitlab/merge_request_templates/Documentation.md
@@ -13,5 +13,6 @@ If this Merge Request addresses an [Issue](https://gitlab.arm.com/networking/ral
* [] [`make docs` target runs successfully](https://gitlab.arm.com/networking/ral/-/blob/main/README.md?ref_type=heads#user-content-documentation)
* [] [`clang-format` and `clang-tidy` run and changes included (C/C++ code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cc-code-style)
* [] [`flake8` run and changes included (Python code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-python-code-style)
+* [] [`cmake-format` run and changes included (CMake code)](https://gitlab.arm.com/networking/ral/-/blob/main/CONTRIBUTING.md#user-content-cmake-code-style)
For any items that are not checked, please provide details.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7959c2d..b5085a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,6 @@ documented in this file.
### Added
### Changed
-- Moved `license_terms/BSD-3-Clause.txt` and
-`license_terms/third_party_licenses.txt` to
-[LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and
-[THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md)
-respectively.
### Deprecated
@@ -22,6 +17,30 @@ respectively.
### Security
+## [24.04] - 2024-04-19
+
+### Added
+- Makefile target `bench_excel_summary` to run the benchmarks and create an
+Excel spreadsheet containing the results.
+
+### Changed
+- Moved `license_terms/BSD-3-Clause.txt` and
+`license_terms/third_party_licenses.txt` to
+[LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md) and
+[THIRD_PARTY_LICENSES.md](https://gitlab.arm.com/networking/ral/-/blob/main/THIRD_PARTY_LICENSES.md)
+respectively.
+
+- Extended `armral_cmplx_pseudo_inverse_direct_f32` and
+`armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the regularized
+pseudo-inverse of a single complex 32-bit matrix of size `M-by-N` for the case
+where `M` and/or `N` == 1.
+
+- Improved SVE2 performance of `armral_turbo_decode_block` and
+`armral_turbo_decode_block_noalloc`.
+
+- Improved SVE2 performance of `armral_ldpc_encode_block` and
+`armral_ldpc_encode_block_noalloc`.
+
## [24.01] - 2024-01-19
### Changed
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 243d864..47894de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,165 +1,184 @@ cmake_minimum_required(VERSION 3.3)
-project(armral VERSION 24.01)
+project(armral VERSION 24.04)
if(CMAKE_VERSION VERSION_GREATER 3.4)
- # stop CMake from automatically adding -rdynamic to linker flags
- # because the semihosting toolchain does not understand that flag
+ # Stop CMake from automatically adding -rdynamic to linker flags because it
+ # causes a warning about unused compiler options when using Clang
cmake_policy(SET CMP0065 NEW)
endif()
# set default build type if none was specified with -DCMAKE_BUILD_TYPE=...
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to RELEASE as none was specified.")
- set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "Choose the type of build." FORCE)
+ set(CMAKE_BUILD_TYPE
+ RELEASE
+ CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release")
endif()
-option(ARMRAL_ENABLE_WERROR "Enable -Werror when building the library and tests" OFF)
-option(ARMRAL_ENABLE_ASAN "Enable AddressSanitizer when building the library and tests" OFF)
-option(ARMRAL_ENABLE_EFENCE "Enable Electric Fence when building the library and tests" OFF)
-option(ARMRAL_ENABLE_COVERAGE "Enable instrumentation for generating code coverage" OFF)
+option(ARMRAL_ENABLE_WERROR
+ "Enable -Werror when building the library and tests" OFF)
+option(ARMRAL_ENABLE_ASAN
+ "Enable AddressSanitizer when building the library and tests" OFF)
+option(ARMRAL_ENABLE_EFENCE
+ "Enable Electric Fence when building the library and tests" OFF)
+option(ARMRAL_ENABLE_COVERAGE
+ "Enable instrumentation for generating code coverage" OFF)
option(BUILD_SIMULATION "Enable building channel simulation programs" ON)
-set(ARMRAL_ARCH NEON CACHE STRING "The architecture to build for ('NEON' or 'SVE2')")
+set(ARMRAL_ARCH
+ NEON
+ CACHE STRING "The architecture to build for ('NEON' or 'SVE2')")
set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2")
set(ARMRAL_LIB_SOURCES
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c
- ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c
-
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp 
- ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/SVD/arm_svd.cpp -) + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp) # Per source file compiler flag overrides/additions if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # Enable -frename-registers for GCC release builds of arm_cmplx_mat_mult_ahb_f32.c. - # This improves register allocation for SVE instructions such as the indexed FMLA, - # which have a restricted range for the indexed operand. A patch to improve register - # allocation has been accepted upstream (see below) and will probably be part of GCC 14, - # but it is unlikely to be backported to any previous releases. + # Enable -frename-registers for GCC release builds of + # arm_cmplx_mat_mult_ahb_f32.c. This improves register allocation for SVE + # instructions such as the indexed FMLA, which have a restricted range for the + # indexed operand. A patch to improve register allocation has been accepted + # upstream (see below) and will probably be part of GCC 14, but it is unlikely + # to be backported to any previous releases. # - # See: https://github.com/gcc-mirror/gcc/commit/6d25ea520f7ed58568c9a0031409bc8e38b673f3 - # Note: We don't universally enable this flag, as in some cases it can cause regressions. - set_property(SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c - APPEND PROPERTY COMPILE_OPTIONS $<$,$>:-frename-registers>) + # See: + # https://github.com/gcc-mirror/gcc/commit/6d25ea520f7ed58568c9a0031409bc8e38b673f3 + # Note: We don't universally enable this flag, as in some cases it can cause + # regressions. 
+ set_property( + SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c + APPEND + PROPERTY COMPILE_OPTIONS + $<$,$>:-frename-registers> + ) if(ARMRAL_ENABLE_WERROR) - # Disable warnings-as-errors about C-style Variable Length Arrays in FFT source when using Clang++ - set_property(SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp - APPEND PROPERTY COMPILE_OPTIONS $<$:-Wno-error=vla-extension>) + # Disable warnings-as-errors about C-style Variable Length Arrays in FFT + # source when using Clang++ + set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + APPEND + PROPERTY COMPILE_OPTIONS + $<$:-Wno-error=vla-extension>) endif() endif() -set(ARMRAL_UTIL_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/utils/rng.cpp -) +set(ARMRAL_UTIL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/utils/rng.cpp) set(ARMRAL_LIB_INC - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CMAKE_CURRENT_SOURCE_DIR}/utils -) + ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/utils) set(ARMRAL_TEST_INC - ${CMAKE_CURRENT_SOURCE_DIR}/utils - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo -) + ${CMAKE_CURRENT_SOURCE_DIR}/utils + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo) set(ARMRAL_OVERRIDE_COMPILE_FLAGS FALSE) if(NOT CMAKE_C_FLAGS STREQUAL "") - if(CMAKE_CXX_FLAGS STREQUAL "") - message(FATAL_ERROR "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set") - endif() + if(CMAKE_CXX_FLAGS STREQUAL "") + message( + FATAL_ERROR + "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set" + ) + endif() endif() if(NOT CMAKE_CXX_FLAGS STREQUAL "") - if(CMAKE_C_FLAGS STREQUAL "") - message(FATAL_ERROR "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set") - endif() - set(ARMRAL_OVERRIDE_COMPILE_FLAGS TRUE) + if(CMAKE_C_FLAGS STREQUAL "") + message( + FATAL_ERROR + "If overriding compile flags, both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS need to be set" + ) + endif() + set(ARMRAL_OVERRIDE_COMPILE_FLAGS TRUE) endif() set(ARMRAL_COMPILER_FLAGS "") @@ -167,7 +186,10 @@ set(ARMRAL_LINKER_FLAGS "") if(ARMRAL_ENABLE_WERROR) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) - message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_WERROR") + message( + WARNING + "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_WERROR" + ) else() set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -Werror) endif() @@ -175,7 +197,10 @@ endif() if(ARMRAL_ENABLE_ASAN) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) - message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_ASAN") + message( + WARNING + "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_ASAN" + ) else() set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -fsanitize=address) set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -fsanitize=address) @@ -184,7 +209,10 @@ endif() if(ARMRAL_ENABLE_EFENCE) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) - message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. 
Ignoring option ARMRAL_ENABLE_EFENCE") + message( + WARNING + "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_EFENCE" + ) else() set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -lefence) set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -lefence) @@ -193,16 +221,24 @@ endif() if(ARMRAL_ENABLE_COVERAGE) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) - message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_COVERAGE") + message( + WARNING + "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_COVERAGE" + ) else() - set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage -fprofile-update=atomic) - set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage -fprofile-update=atomic) + set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage + -fprofile-update=atomic) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage + -fprofile-update=atomic) endif() endif() if(ARMRAL_SEMIHOSTING) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) - message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_SEMIHOSTING") + message( + WARNING + "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_SEMIHOSTING" + ) else() set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --specs=rdimon.specs) set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -lc -lrdimon) @@ -214,29 +250,39 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) endif() if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # If the optimisation flags are already set, don't try and guess what they + # If the optimization flags are already set, don't try and guess what they # should be. if(ARMRAL_ARCH STREQUAL "SVE2") set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") - set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+sve2+crypto" CACHE INTERNAL "") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.5-a+sve2+crypto+fp16" + CACHE INTERNAL "") elseif(ARMRAL_ARCH STREQUAL "SVE") set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") - set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+sve+crypto" CACHE INTERNAL "") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8.2-a+sve+crypto+fp16" + CACHE INTERNAL "") elseif(ARMRAL_ARCH STREQUAL "NEON") - set(ARMRAL_ARCH_COMPILE_OPTIONS "-march=armv8-a+crypto" CACHE INTERNAL "") + set(ARMRAL_ARCH_COMPILE_OPTIONS + "-march=armv8-a+crypto" + CACHE INTERNAL "") else() - message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") endif() elseif(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) - # We explicitly set the optimisation flags, so just copy those. We still need to - # set the appropriate SVE version definition + # We explicitly set the optimization flags, so just copy those. 
We still need + # to set the appropriate SVE version definition set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) if(ARMRAL_ARCH STREQUAL "SVE2") set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=2") elseif(ARMRAL_ARCH STREQUAL "SVE") set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") endif() else() set(ARMRAL_ARCH_COMPILE_OPTIONS "") @@ -245,19 +291,32 @@ else() elseif(ARMRAL_ARCH STREQUAL "SVE") set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_SVE=1") elseif(NOT ARMRAL_ARCH STREQUAL "NEON") - message(FATAL_ERROR "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") + message( + FATAL_ERROR + "Invalid value of -DARMRAL_ARCH, should be 'NEON', 'SVE' or 'SVE2'") endif() endif() if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) set(ARMRAL_COMPILER_FLAGS - ${ARMRAL_COMPILER_FLAGS} - $<$:-Wshadow -Wall -Wcast-qual> - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17> - $<$:-Og -g3 -ggdb -fno-omit-frame-pointer>) + ${ARMRAL_COMPILER_FLAGS} + $<$:-Wshadow + -Wall + -Wcast-qual> + $<$:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17> + $<$:-Og + -g3 + -ggdb + -fno-omit-frame-pointer>) message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}") else() - # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the compile line + # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the + # compile line message(STATUS "Overriding compilation flags with manually set flags") message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") @@ -268,26 +327,31 @@ endif() add_library(armral ${ARMRAL_LIB_SOURCES}) target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) -target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) +target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) -target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} ${ARMRAL_COMPILER_FLAGS}) +target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) if(ARMRAL_SEMIHOSTING) - # when semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a - # compiler flag, so we specify the string "ARMRAL_SEMIHOSTING" - # rather than the CMake variable ARMRAL_SEMIHOSTING + # when semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable + # ARMRAL_SEMIHOSTING target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") target_compile_definitions(armral_utils PUBLIC "ARMRAL_SEMIHOSTING") endif() install(TARGETS armral DESTINATION lib) -install(DIRECTORY include/ DESTINATION include - FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY include/ + DESTINATION include + FILES_MATCHING + PATTERN "*.h") install(FILES LICENSE.md THIRD_PARTY_LICENSES.md DESTINATION share/licenses/armral) @@ -305,26 +369,36 @@ if(BUILD_TESTING) 
set(BENCHMARKER_RUNNER "${BENCHMARKER_SOURCE_DIR}/bench/default_runner.py") endif() - add_custom_target(check - COMMAND ${CMAKE_CTEST_COMMAND} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - - add_custom_target(bench - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_SOURCE_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - ${JOB_POOL_CONSOLE} - WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) - - add_custom_target(bench_concurrent - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_SOURCE_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - --concurrent - ${JOB_POOL_CONSOLE} - WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + add_custom_target( + check + COMMAND ${CMAKE_CTEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_custom_target( + bench + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_concurrent + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_excel_summary + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent | tee + ${BENCHMARKER_BUILD_DIR}/out.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py + ${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) set(ARMRAL_TEST_LINK_LIBRARIES armral armral_utils) @@ -334,232 +408,365 @@ if(BUILD_TESTING) # utility function to add a test function(add_armral_test TEST_NAME TEST_SOURCE) - get_filename_component(TEST_DIR ${TEST_SOURCE} DIRECTORY) - # build the actual test executable itself add_executable(${TEST_NAME} ${TEST_SOURCE}) - target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) + target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) target_include_directories(${TEST_NAME} PRIVATE ${ARMRAL_TEST_INC}) - target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} ${ARMRAL_ARCH_COMPILE_OPTIONS}) + target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} + ${ARMRAL_ARCH_COMPILE_OPTIONS}) # register it as a test, set up dependencies - add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} + ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) add_dependencies(check ${TEST_NAME}) endfunction() # utility function to add a benchmark function(add_armral_bench BENCH_NAME BENCH_SOURCE) - get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) # build the actual bench executable itself add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) - target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} ${ARMRAL_LINKER_FLAGS}) + target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) # register it as a benchmark, set up dependencies add_dependencies(bench bench_${BENCH_NAME}) add_dependencies(bench_concurrent bench_${BENCH_NAME}) + 
add_dependencies(bench_excel_summary bench_${BENCH_NAME}) # add target for running the benchmark - add_custom_target(run_bench_${BENCH_NAME} - COMMAND ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py - ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} - ${BENCHMARKER_BUILD_DIR} - --runner ${BENCHMARKER_RUNNER} - --concurrent - ${JOB_POOL_CONSOLE} + get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) + add_custom_target( + run_bench_${BENCH_NAME} + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py + ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} ${BENCHMARKER_BUILD_DIR} + --runner ${BENCHMARKER_RUNNER} --concurrent ${JOB_POOL_CONSOLE} WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR} DEPENDS bench_${BENCH_NAME}) -endfunction() - - add_armral_test(fft_cf32 test/FFT/cf32/main.cpp) - add_armral_test(fft_cs16 test/FFT/cs16/main.cpp) - add_armral_test(arm_fir_filter_cf32 test/FIR/arm_fir_filter_cf32/main.cpp) - add_armral_test(arm_fir_filter_cf32_decimate_2 test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp) - add_armral_test(arm_fir_filter_cs16 test/FIR/arm_fir_filter_cs16/main.cpp) - add_armral_test(arm_fir_filter_cs16_decimate_2 test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp) - add_armral_test(modulation test/Modulation/main.cpp) - add_armral_test(demodulation test/Demodulation/main.cpp) - add_armral_test(mu_law_compression test/MuLaw/Compression/main.cpp) - add_armral_test(mu_law_decompression test/MuLaw/Decompression/main.cpp) - add_armral_test(vec_dot_16_32_bit test/VectorDotProd/vecDot16_32bit/main.cpp) - add_armral_test(vec_dot_16_2 test/VectorDotProd/vecDot16_2/main.cpp) - add_armral_test(vec_dot_32 test/VectorDotProd/vecDot32/main.cpp) - add_armral_test(vec_dot_32_2 test/VectorDotProd/vecDot32_2/main.cpp) - add_armral_test(vec_dot_16_2_32_bit test/VectorDotProd/vecDot16_2_32bit/main.cpp) - add_armral_test(vec_dot_16 test/VectorDotProd/vecDot16/main.cpp) - add_armral_test(crc test/CRC/main.cpp) - add_armral_test(matrix_mult_16 test/MatrixMult/single/MatrixMult16/main.cpp) - add_armral_test(arm_solve test/MatrixMult/batch/ArmSolve/main.cpp) - add_armral_test(matrix_mult_32 test/MatrixMult/single/MatrixMult32/main.cpp) - add_armral_test(matrix_mult_aah_32 test/MatrixMult/single/MatrixMultAAH32/main.cpp) - add_armral_test(matrix_mult_ahb_32 test/MatrixMult/single/MatrixMultAHB32/main.cpp) - add_armral_test(vec_mul_32_2 test/ElemWiseVectorMult/vecMul32_2/main.cpp) - add_armral_test(vec_mul_16 test/ElemWiseVectorMult/vecMul16/main.cpp) - add_armral_test(vec_mul_32 test/ElemWiseVectorMult/vecMul32/main.cpp) - add_armral_test(vec_mul_16_2 test/ElemWiseVectorMult/vecMul16_2/main.cpp) - add_armral_test(polar_decoder test/Polar/decoding/main.cpp) - add_armral_test(polar_encoder test/Polar/encoding/main.cpp) - add_armral_test(polar_frozen_mask test/Polar/frozen/main.cpp) - add_armral_test(polar_rate_matching test/Polar/rate_matching/main.cpp) - add_armral_test(polar_rate_recovery test/Polar/rate_recovery/main.cpp) - add_armral_test(polar_subchannel_interleave test/Polar/subchannel_interleave/main.cpp) - add_armral_test(polar_subchannel_deinterleave test/Polar/subchannel_deinterleave/main.cpp) - add_armral_test(polar_crc_attachment test/Polar/crc_attachment/main.cpp) - add_armral_test(block_scaling_compression test/ORanBlockScaling/Compression/main.cpp) - add_armral_test(block_scaling_decompression test/ORanBlockScaling/Decompression/main.cpp) - add_armral_test(block_float_compression test/XRanBlockFloat/Compression/main.cpp) - add_armral_test(block_float_decompression 
test/XRanBlockFloat/Decompression/main.cpp) - add_armral_test(correlation test/Correlation/main.cpp) - add_armral_test(matrix_inv_single test/MatrixInv/single/main.cpp) - add_armral_test(matrix_inv_batch test/MatrixInv/batch/main.cpp) - add_armral_test(matrix_pseudo_inv_direct test/MatrixPseudoInv/direct/main.cpp) - add_armral_test(seq_generator test/SeqGenerator/main.cpp) - add_armral_test(scrambling test/Scrambling/main.cpp) - add_armral_test(ldpc_encoding test/LDPC/encoding/main.cpp) - add_armral_test(ldpc_decoding test/LDPC/decoding/main.cpp) - add_armral_test(ldpc_rate_matching test/LDPC/rate_matching/main.cpp) - add_armral_test(ldpc_rate_recovery test/LDPC/rate_recovery/main.cpp) - add_armral_test(svd test/SVD/main.cpp) - add_armral_test(matrix_vector_mult_single_16 test/MatrixMult/single/MatrixVectorMult16/main.cpp) - add_armral_test(matrix_vector_mult_single_32 test/MatrixMult/single/MatrixVectorMult32/main.cpp) - add_armral_test(matrix_vector_mult_batch_16 test/MatrixMult/batch/MatrixVectorMult16/main.cpp) - add_armral_test(matrix_vector_mult_batch_32 test/MatrixMult/batch/MatrixVectorMult32/main.cpp) - add_armral_test(turbo_encoding test/Turbo/encoding/main.cpp) - add_armral_test(turbo_decoding test/Turbo/decoding/main.cpp) - add_armral_test(turbo_rate_matching test/Turbo/rate_matching/main.cpp) - add_armral_test(turbo_rate_recovery test/Turbo/rate_recovery/main.cpp) - add_armral_test(tail_biting_convolutional_encoding test/ConvCoding/encoding/main.cpp) - add_armral_test(tail_biting_convolutional_decoding test/ConvCoding/decoding/main.cpp) - - add_armral_bench(correlation bench/Correlation/main.cpp) - add_armral_bench(crc_6_be bench/CRC/6/BigEndian/main.cpp) - add_armral_bench(crc_6_le bench/CRC/6/LittleEndian/main.cpp) - add_armral_bench(crc_11_be bench/CRC/11/BigEndian/main.cpp) - add_armral_bench(crc_11_le bench/CRC/11/LittleEndian/main.cpp) - add_armral_bench(crc_16_be bench/CRC/16/BigEndian/main.cpp) - add_armral_bench(crc_16_le bench/CRC/16/LittleEndian/main.cpp) - add_armral_bench(crc_24a_be bench/CRC/24/A/BigEndian/main.cpp) - add_armral_bench(crc_24a_le bench/CRC/24/A/LittleEndian/main.cpp) - add_armral_bench(crc_24b_be bench/CRC/24/B/BigEndian/main.cpp) - add_armral_bench(crc_24b_le bench/CRC/24/B/LittleEndian/main.cpp) - add_armral_bench(crc_24c_be bench/CRC/24/C/BigEndian/main.cpp) - add_armral_bench(crc_24c_le bench/CRC/24/C/LittleEndian/main.cpp) - add_armral_bench(demodulation bench/Demodulation/main.cpp) - add_armral_bench(vec_mul_16 bench/ElemWiseVectorMult/VecMul16/main.cpp) - add_armral_bench(vec_mul_16_2 bench/ElemWiseVectorMult/VecMul16_2/main.cpp) - add_armral_bench(vec_mul_32 bench/ElemWiseVectorMult/VecMul32/main.cpp) - add_armral_bench(vec_mul_32_2 bench/ElemWiseVectorMult/VecMul32_2/main.cpp) - add_armral_bench(fft_cs16 bench/FFT/FFT16/main.cpp) - add_armral_bench(fft_cf32 bench/FFT/FFT32/main.cpp) - add_armral_bench(arm_fir_filter_cs16 bench/FIR/FIR16/main.cpp) - add_armral_bench(arm_fir_filter_cs16_decimate_2 bench/FIR/FIR16Decimate2/main.cpp) - add_armral_bench(arm_fir_filter_cf32 bench/FIR/FIR32/main.cpp) - add_armral_bench(arm_fir_filter_cf32_decimate_2 bench/FIR/FIR32Decimate2/main.cpp) - add_armral_bench(ldpc_decoding bench/LDPC/Decoding/main.cpp) - add_armral_bench(ldpc_encoding bench/LDPC/Encoding/main.cpp) - add_armral_bench(ldpc_rate_matching bench/LDPC/RateMatching/main.cpp) - add_armral_bench(ldpc_rate_recovery bench/LDPC/RateRecovery/main.cpp) - add_armral_bench(matrix_inv_single_general bench/MatrixInv/Single/GeneralMatInv/main.cpp) - 
add_armral_bench(matrix_inv_single_hermitian bench/MatrixInv/Single/HermitianMatInv/main.cpp) - add_armral_bench(matrix_inv_batch_general bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) - add_armral_bench(matrix_inv_batch_general_pa bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) - add_armral_bench(matrix_inv_batch_hermitian bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) - add_armral_bench(matrix_inv_batch_hermitian_pa bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) - add_armral_bench(matrix_pseudo_inv_direct bench/MatrixPseudoInv/Direct/main.cpp) - add_armral_bench(arm_solve_1x2 bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp) - add_armral_bench(arm_solve_1x4 bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp) - add_armral_bench(arm_solve_2x2 bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp) - add_armral_bench(arm_solve_2x4 bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp) - add_armral_bench(arm_solve_4x4 bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp) - add_armral_bench(matrix_mult_i16_32b bench/MatrixMult/Single/MatrixMult16/32b/main.cpp) - add_armral_bench(matrix_mult_i16_64b bench/MatrixMult/Single/MatrixMult16/64b/main.cpp) - add_armral_bench(matrix_mult_f32_2x2 bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) - add_armral_bench(matrix_mult_f32_2x2_iq bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) - add_armral_bench(matrix_mult_f32_4x4 bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) - add_armral_bench(matrix_mult_f32_4x4_iq bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) - add_armral_bench(matrix_mult_f32_general bench/MatrixMult/Single/MatrixMult32/general/main.cpp) - add_armral_bench(matrix_mult_ahb_32 bench/MatrixMult/Single/MatrixMultAHB32/main.cpp) - add_armral_bench(matrix_mult_aah_32 bench/MatrixMult/Single/MatrixMultAAH32/main.cpp) - add_armral_bench(modulation bench/Modulation/main.cpp) - add_armral_bench(mu_law_compression_8bit bench/MuLaw/Compression/8bit/main.cpp) - add_armral_bench(mu_law_compression_9bit bench/MuLaw/Compression/9bit/main.cpp) - add_armral_bench(mu_law_compression_14bit bench/MuLaw/Compression/14bit/main.cpp) - add_armral_bench(mu_law_decompression_8bit bench/MuLaw/Decompression/8bit/main.cpp) - add_armral_bench(mu_law_decompression_9bit bench/MuLaw/Decompression/9bit/main.cpp) - add_armral_bench(mu_law_decompression_14bit bench/MuLaw/Decompression/14bit/main.cpp) - add_armral_bench(block_scaling_compression_8bit bench/ORanBlockScaling/Compression/8bit/main.cpp) - add_armral_bench(block_scaling_compression_9bit bench/ORanBlockScaling/Compression/9bit/main.cpp) - add_armral_bench(block_scaling_compression_14bit bench/ORanBlockScaling/Compression/14bit/main.cpp) - add_armral_bench(block_scaling_decompression_8bit bench/ORanBlockScaling/Decompression/8bit/main.cpp) - add_armral_bench(block_scaling_decompression_9bit bench/ORanBlockScaling/Decompression/9bit/main.cpp) - add_armral_bench(block_scaling_decompression_14bit bench/ORanBlockScaling/Decompression/14bit/main.cpp) - add_armral_bench(block_float_compression_8bit bench/XRanBlockFloat/Compression/8bit/main.cpp) - add_armral_bench(block_float_compression_9bit bench/XRanBlockFloat/Compression/9bit/main.cpp) - add_armral_bench(block_float_compression_12bit bench/XRanBlockFloat/Compression/12bit/main.cpp) - add_armral_bench(block_float_compression_14bit bench/XRanBlockFloat/Compression/14bit/main.cpp) - add_armral_bench(block_float_decompression_8bit bench/XRanBlockFloat/Decompression/8bit/main.cpp) - add_armral_bench(block_float_decompression_9bit 
bench/XRanBlockFloat/Decompression/9bit/main.cpp) - add_armral_bench(block_float_decompression_12bit bench/XRanBlockFloat/Decompression/12bit/main.cpp) - add_armral_bench(block_float_decompression_14bit bench/XRanBlockFloat/Decompression/14bit/main.cpp) - add_armral_bench(polar_decoder bench/Polar/Decoding/main.cpp) - add_armral_bench(polar_encoder bench/Polar/Encoding/main.cpp) - add_armral_bench(polar_frozen_mask bench/Polar/Frozen/main.cpp) - add_armral_bench(polar_rate_matching bench/Polar/RateMatching/main.cpp) - add_armral_bench(polar_rate_recovery bench/Polar/RateRecovery/main.cpp) - add_armral_bench(polar_subchannel_deinterleave bench/Polar/SubchannelDeinterleave/main.cpp) - add_armral_bench(polar_subchannel_interleave bench/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(seq_generator bench/SeqGenerator/main.cpp) - add_armral_bench(scrambling bench/Scrambling/main.cpp) - add_armral_bench(svd bench/SVD/main.cpp) - add_armral_bench(vec_dot_16 bench/VectorDotProd/VecDot16/main.cpp) - add_armral_bench(vec_dot_16_2 bench/VectorDotProd/VecDot16_2/main.cpp) - add_armral_bench(vec_dot_16_2_32_bit bench/VectorDotProd/VecDot16_2_32bit/main.cpp) - add_armral_bench(vec_dot_16_32_bit bench/VectorDotProd/VecDot16_32bit/main.cpp) - add_armral_bench(vec_dot_32 bench/VectorDotProd/VecDot32/main.cpp) - add_armral_bench(vec_dot_32_2 bench/VectorDotProd/VecDot32_2/main.cpp) - add_armral_bench(matrix_vector_mult_i16_32b bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) - add_armral_bench(matrix_vector_mult_i16_64b bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) - add_armral_bench(matrix_vector_mult_32 bench/MatrixMult/Single/MatrixVectorMult32/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_32b bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_32b_pa bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_64b bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_i16_64b_pa bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_f32 bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) - add_armral_bench(matrix_vector_mult_batch_f32_pa bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) - add_armral_bench(turbo_encoding bench/Turbo/Encoding/main.cpp) - add_armral_bench(turbo_decoding bench/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_rate_matching bench/Turbo/RateMatching/main.cpp) - add_armral_bench(turbo_rate_recovery bench/Turbo/RateRecovery/main.cpp) - add_armral_bench(tail_biting_convolutional_encoding bench/ConvCoding/Encoding/main.cpp) - add_armral_bench(tail_biting_convolutional_decoding bench/ConvCoding/Decoding/main.cpp) + endfunction() + + add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) + add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) + add_armral_test(arm_solve + test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) + add_armral_test( + matrix_vector_mult_batch_16 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_batch_32 + test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_mult_16 + test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) + add_armral_test(matrix_mult_32 + test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) + add_armral_test(matrix_mult_aah_32 + 
test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_test(matrix_mult_ahb_32 + test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_test( + matrix_vector_mult_single_16 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) + add_armral_test( + matrix_vector_mult_single_32 + test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_test(matrix_pseudo_inv_direct + test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_test(vec_dot_16_2 + test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_test(vec_dot_16_2_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_test(vec_dot_16_32_bit + test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_test(vec_dot_32_2 + test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_test(vec_mul_16 test/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_test(mu_law_compression + test/DuRuInterface/MuLaw/Compression/main.cpp) + add_armral_test(mu_law_decompression + test/DuRuInterface/MuLaw/Decompression/main.cpp) + add_armral_test(block_float_compression + test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) + add_armral_test(block_float_decompression + test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) + add_armral_test(block_scaling_compression + test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) + add_armral_test(block_scaling_decompression + test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) + add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) + add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) + add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) + add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) + add_armral_test(arm_fir_filter_cs16_decimate_2 + test/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) + add_armral_test(arm_fir_filter_cf32_decimate_2 + test/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) + add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) + add_armral_test(tail_biting_convolutional_decoding + test/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_test(tail_biting_convolutional_encoding + test/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) + add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_test(ldpc_encoding test/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) + add_armral_test(polar_crc_attachment + test/UpperPHY/Polar/CrcAttachment/main.cpp) + add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) + add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) + add_armral_test(polar_frozen_mask 
test/UpperPHY/Polar/Frozen/main.cpp) + add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_test(polar_subchannel_deinterleave + test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_test(polar_subchannel_interleave + test/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) + + add_armral_bench( + matrix_inv_batch_general + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) + add_armral_bench(matrix_inv_batch_general_pa + bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) + add_armral_bench( + matrix_inv_batch_hermitian_pa + bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) + add_armral_bench(matrix_inv_single_general + bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) + add_armral_bench(matrix_inv_single_hermitian + bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) + add_armral_bench(arm_solve_1x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) + add_armral_bench(arm_solve_1x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) + add_armral_bench(arm_solve_2x2 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) + add_armral_bench(arm_solve_2x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) + add_armral_bench(arm_solve_4x4 + bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_32b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_i16_64b_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32 + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) + add_armral_bench( + matrix_vector_mult_batch_f32_pa + bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) + add_armral_bench( + matrix_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) + add_armral_bench( + matrix_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_2x2 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4_iq + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) + add_armral_bench( + matrix_mult_f32_4x4 + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) + add_armral_bench( + matrix_mult_f32_general + bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) + add_armral_bench( + matrix_mult_aah_32 + 
bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) + add_armral_bench( + matrix_mult_ahb_32 + bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_32b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) + add_armral_bench( + matrix_vector_mult_i16_64b + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) + add_armral_bench( + matrix_vector_mult_32 + bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) + add_armral_bench(matrix_pseudo_inv_direct + bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) + add_armral_bench(vec_dot_16 + bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) + add_armral_bench(vec_dot_16_2 + bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) + add_armral_bench(vec_dot_16_2_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) + add_armral_bench(vec_dot_16_32_bit + bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) + add_armral_bench(vec_dot_32 + bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) + add_armral_bench(vec_dot_32_2 + bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) + add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) + add_armral_bench(vec_mul_16_2 + bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) + add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) + add_armral_bench(vec_mul_32_2 + bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) + add_armral_bench(mu_law_compression_14bit + bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) + add_armral_bench(mu_law_compression_8bit + bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) + add_armral_bench(mu_law_compression_9bit + bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) + add_armral_bench(mu_law_decompression_14bit + bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) + add_armral_bench(mu_law_decompression_8bit + bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) + add_armral_bench(mu_law_decompression_9bit + bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) + add_armral_bench( + block_float_compression_12bit + bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) + add_armral_bench( + block_float_compression_14bit + bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) + add_armral_bench(block_float_compression_8bit + bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) + add_armral_bench(block_float_compression_9bit + bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) + add_armral_bench( + block_float_decompression_12bit + bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) + add_armral_bench( + block_float_decompression_14bit + bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) + add_armral_bench( + block_float_decompression_8bit + bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) + add_armral_bench( + block_float_decompression_9bit + bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) + add_armral_bench( + block_scaling_compression_14bit + bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) + add_armral_bench( + block_scaling_compression_8bit + bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) + add_armral_bench( + block_scaling_compression_9bit + bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) + add_armral_bench( + block_scaling_decompression_14bit + bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) + add_armral_bench( + 
block_scaling_decompression_8bit + bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) + add_armral_bench( + block_scaling_decompression_9bit + bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) + add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) + add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) + add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) + add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) + add_armral_bench(arm_fir_filter_cs16_decimate_2 + bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) + add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) + add_armral_bench(arm_fir_filter_cf32_decimate_2 + bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) + add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) + add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) + add_armral_bench(tail_biting_convolutional_decoding + bench/UpperPHY/ConvolutionalDecoder/main.cpp) + add_armral_bench(tail_biting_convolutional_encoding + bench/UpperPHY/ConvolutionalEncoder/main.cpp) + add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) + add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) + add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) + add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) + add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) + add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) + add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) + add_armral_bench(polar_rate_matching + bench/UpperPHY/Polar/RateMatching/main.cpp) + add_armral_bench(polar_rate_recovery + bench/UpperPHY/Polar/RateRecovery/main.cpp) + add_armral_bench(polar_subchannel_deinterleave + bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) + add_armral_bench(polar_subchannel_interleave + bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_rate_matching + bench/UpperPHY/Turbo/RateMatching/main.cpp) + add_armral_bench(turbo_rate_recovery + bench/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) endif() if(BUILD_EXAMPLES) - add_custom_target(make_examples_dir ALL - COMMAND ${CMAKE_COMMAND} -E make_directory examples) + 
add_custom_target(make_examples_dir ALL COMMAND ${CMAKE_COMMAND} -E + make_directory examples) add_custom_target(examples) add_custom_target(run_examples) add_dependencies(run_examples examples) - # Any parameters after the first one will be passed as parameters - # to the example executable when running it + # Any parameters after the first one will be passed as parameters to the + # example executable when running it function(add_armral_example EXAMPLE_SOURCE) get_filename_component(EXAMPLE_EXE ${EXAMPLE_SOURCE} NAME_WE) add_executable(${EXAMPLE_EXE} ${EXAMPLE_SOURCE}) add_dependencies(${EXAMPLE_EXE} make_examples_dir) set(EXAMPLE_OUTPUT_NAME examples/${EXAMPLE_EXE}) - set_target_properties(${EXAMPLE_EXE} - PROPERTIES - OUTPUT_NAME ${EXAMPLE_OUTPUT_NAME}) + set_target_properties(${EXAMPLE_EXE} PROPERTIES OUTPUT_NAME + ${EXAMPLE_OUTPUT_NAME}) target_link_libraries(${EXAMPLE_EXE} armral m) - add_custom_target(run_${EXAMPLE_EXE} - COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} - DEPENDS ${EXAMPLE_EXE} - ) + add_custom_target( + run_${EXAMPLE_EXE} + COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} + DEPENDS ${EXAMPLE_EXE}) add_dependencies(examples ${EXAMPLE_EXE}) add_dependencies(run_examples run_${EXAMPLE_EXE}) endfunction() @@ -571,44 +778,50 @@ if(BUILD_EXAMPLES) endif() if(BUILD_SIMULATION) - # Include simulation rules and targets - # This involves building dependencies like AWGN library and OpenMP + # Include simulation rules and targets This involves building dependencies + # like AWGN library and OpenMP add_subdirectory(simulation) endif() find_package(Doxygen) -if (DOXYGEN_FOUND) +if(DOXYGEN_FOUND) set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) add_custom_target(docs COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}) endif() -# uninstall target +# Create target to uninstall the library if(NOT TARGET uninstall) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" - "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" - IMMEDIATE @ONLY) + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) - add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) + add_custom_target( + uninstall COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) endif() - # Check that the C and C++ compilers are from the same toolchain -if (NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) - message(FATAL_ERROR "CXX and C compiler providers differ. Please specify the same compiler toolchain") +if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) + message( + FATAL_ERROR + "CXX and C compiler providers differ. Please specify the same compiler toolchain" + ) endif() -set (COMP_ERR_MSG "Compilation is only supported with GNU versions 7, 8, 9, 10, \ +set(COMP_ERR_MSG + "Compilation is only supported with GNU versions 7, 8, 9, 10, \ 11, 12, 13, or Clang versions greater than or equal to 12.0.1. \ - If compilation fails please use one of the supported compilers.") -if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION VERSION_GREATER 13.2) + If compilation fails please use one of the supported compilers." 
+) +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION + VERSION_GREATER 13.2) message(WARNING ${COMP_ERR_MSG}) endif() -elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang") - if (CMAKE_C_COMPILER_VERSION VERSION_LESS 12.0.1) +elseif(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 12.0.1) message(WARNING ${COMP_ERR_MSG}) endif() else() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53a8a30..d4b42d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,7 @@ -# Contributing to Arm RAN Acceleration Library (Arm RAL) +# Contributing to Arm RAN Acceleration Library (ArmRAL) Describes the requirements for contributing code to Arm RAN -Acceleration Library (Arm RAL): +Acceleration Library (ArmRAL): - The license; - How to write and submit patches; @@ -47,13 +47,15 @@ any relevant reasoning. ## Function naming -Arm RAL functions are named according to: +ArmRAL functions are named according to: - armral__{_variant} +``` +armral__{_variant} +``` where: -- *algorithm* is a word or words that summarises the main purpose of +- *algorithm* is a word or words that summarizes the main purpose of the function; - *precision* indicates the working precision of the internals of the @@ -62,13 +64,13 @@ where: For Fast Fourier Transform (FFT) functions use: - - `cf32`: complex 32-bit floating point; - - `cs16`: complex signed 16-bit integer. + - `cf32`: complex 32-bit floating point; + - `cs16`: complex signed 16-bit integer. For all other functions use: - - `f32`: 32-bit floating point; - - `i16`: signed 16-bit integer. + - `f32`: 32-bit floating point; + - `i16`: signed 16-bit integer. - *variant* is an optional suffix to distinguish different implementations of the same *algorithm* at the same *precision*. @@ -82,9 +84,9 @@ Examples from the library: `armral_cmplx_mat_mult_2x2_f32_iq` | Complex-valued 2x2 matrix multiplication | 32-bit floating point | Separate I and Q arrays `armral_cmplx_vecdot_i16_32bit` | Complex-valued vector dot-product | signed 16-bit integer | 32-bit accumulator -## Directory structure +## ArmRAL directory structure -The directory structure of Arm RAL is: +The directory structure of ArmRAL is: ``` +-- CMakeLists.txt @@ -93,9 +95,12 @@ The directory structure of Arm RAL is: +-- RELEASE_NOTES.md +-- THIRD_PARTY_LICENSES.md +-- bench -| +-- CRC -| +-- bench.py -| +-- main.cpp +| +-- BasicMathFun +| +-- MatrixInv +| +-- Single +| +-- GeneralMatInv +| +-- bench.py +| +-- main.cpp | +-- ... +-- docs | +-- ... @@ -113,8 +118,10 @@ The directory structure of Arm RAL is: | +-- ... | +-- ... +-- test -| +-- CRC -| +-- main.cpp +| +-- BasicMathFun +| +-- MatrixInv +| +-- Single +| +-- main.cpp | +-- ... +-- utils | +-- ... @@ -142,13 +149,13 @@ the custom allocators defined in `src/utils/allocators.hpp`. These offer two advantages: 1. Developers do not need to ensure dynamically-allocated memory is -freed after use. + freed after use. 2. All user-facing functions (defined in `include/armral.h`) that need -to allocate memory internally must also provide a non-allocating -version that allows users to pass in a pre-allocated buffer. Using Arm -RAL's custom allocators simplifies writing these variants because they -offer a counting allocator in addition to one that uses `malloc`. + to allocate memory internally must also provide a non-allocating + version that allows users to pass in a pre-allocated buffer. 
Using + ArmRAL's custom allocators simplifies writing these variants because they + offer a counting allocator in addition to one that uses `malloc`. C-style variable length arrays (VLAs) can only be used in the FFT functions (`armral/src/LowerPHY/FFT`). @@ -156,7 +163,7 @@ functions (`armral/src/LowerPHY/FFT`). ### Namespaces All symbols in the library must be clearly identified as coming from -Arm RAL. User-facing functions specified in `include/armral.h` are +ArmRAL. User-facing functions specified in `include/armral.h` are identified by the prefix `armral_`. Using C++ enables us to enclose other library functions in namespaces. These namespaces must begin with `armral::` and can themselves contain further namespaces to @@ -168,7 +175,7 @@ the `static` keyword. ### No dependency on C++ standard library at runtime -We require that Arm RAL does not have a dependency on the C++ runtime +We require that ArmRAL does not have a dependency on the C++ runtime library as this enables `libarmral` to be linked against on systems that do not have the C++ runtime library installed. This means that constructs like `std::vector` must not be used by functions in the @@ -181,7 +188,7 @@ constructs in testing and benchmarking code: for example, Documentation for each user-facing function is written as a Doxygen comment immediately preceding the function's prototype in -`include/armral.h`. Arm RAL uses the Javadoc style, which is a C-style +`include/armral.h`. ArmRAL uses the Javadoc style, which is a C-style multi-line comment that starts with `/**`: ```c @@ -252,7 +259,7 @@ C/C++ code style is maintained through the use of `clang-format` and patch; instructions on how to run these tools are given below. `clang-format` and `clang-tidy` are part of the [LLVM -Project](https://llvm.org/). Arm RAL is tested with version 17.0.0 of +Project](https://llvm.org/). ArmRAL is tested with version 17.0.0 of the tools. Matching your coding style as close as possible to the `clang-tidy` @@ -262,26 +269,28 @@ enforce: - Use snake case for names of variables and functions, i.e. `this_is_a_variable` instead of `thisIsAVariable`. -- Symbol names start with a lower case letter. This means that `_m` - for a member variable, for example, will not be accepted. +- Symbol names start with a lower case letter. This means that `_m` + for a member variable, for example, will not be accepted. -- Always use curly braces for single line `if` statements, `for` loops - and `while` loops. +- Always use curly braces for single line `if` statements, `for` loops + and `while` loops. -- Opening curly braces for `if` statements, `for` loops and `while` - loops are on the same line as the `if`, `for` or `while`. +- Opening curly braces for `if` statements, `for` loops and `while` + loops are on the same line as the `if`, `for` or `while`. -- Closing curly braces are the first non-white-space character on a - new line. Their alignment must match the first character of the - matching `if`/`for`/`while` statement. `else` statements are on the - same line as a closing curly brace for the corresponding `if` or `else - if` statement. +- Closing curly braces are the first non-white-space character on a + new line. Their alignment must match the first character of the + matching `if`/`for`/`while` statement. `else` statements are on the + same line as a closing curly brace for the corresponding `if` or `else + if` statement. 
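To make these layout rules concrete, the following short C++ fragment (invented names, shown purely for illustration and not taken from the library) follows the style that `clang-format` and `clang-tidy` enforce:

```cpp
#include <cstdio>

// Illustrative only: snake_case names, curly braces on every branch,
// opening braces on the same line as the statement, and `else if` on the
// same line as the closing brace of the preceding `if`.
static float clamp_to_unit_range(float value) {
  if (value > 1.0F) {
    value = 1.0F;
  } else if (value < -1.0F) {
    value = -1.0F;
  }
  return value;
}

int main() {
  float samples[] = {-2.5F, 0.25F, 3.0F};
  for (float &sample : samples) {
    sample = clamp_to_unit_range(sample);
    std::printf("%.2f\n", sample);
  }
  return 0;
}
```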
### Running clang-format Run `clang-format` on the current commit with: - git clang-format HEAD~ +``` +git clang-format HEAD~ +``` This will correctly format any files modified in the current commit. You must then update your commit with the reformatted files.
@@ -292,10 +301,12 @@ Before running `clang-tidy` you must compile the library with an LLVM compiler, i.e. `clang` and `clang++`, and tell CMake to write out the compilation commands by setting `-DCMAKE_EXPORT_COMPILE_COMMANDS=On`: - mkdir - cd - cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DBUILD_TESTING=On - make +``` +mkdir +cd +cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DBUILD_TESTING=On +make +``` Substituting:
@@ -306,8 +317,10 @@ Substituting: Then run `clang-tidy` with a list of files to check: - cd - clang-tidy -p ... -header-filter=.* +``` +cd +clang-tidy -p ... -header-filter=.* +``` where `` is the path to a modified file in the library source. Fix any errors and update your commit with the modified files.
@@ -317,11 +330,15 @@ source. Fix any errors and update your commit with the modified files. Python code style is maintained through the use of the `flake8` linter. Install `flake8` using `pip`: - pip install flake8 +``` +pip install flake8 +``` and run it on an individual Python file: - python -m flake8 --config=/flake8.txt +``` +python -m flake8 --config=/flake8.txt +``` Where:
@@ -333,6 +350,24 @@ This will produce a list of errors, which you must fix manually. Once you have rerun `flake8` and it does not report any errors, add your updated Python file to the current commit.
+## CMake code style + +CMake code style is maintained through the use of the `cmake-format` +tool. Install `cmake-format` using `pip`: + +``` +pip install cmake-format +``` + +and run it on an individual `CMakeLists.txt` file: + +``` +cmake-format -i CMakeLists.txt +``` + +This will correctly format the specified file. You must then update +your commit with the reformatted file. +
## Writing tests Each function with a prototype in `armral.h` must be accompanied by a
@@ -342,7 +377,7 @@ preferably a separate reimplementation of the function. In some situations it may be necessary to compare against arrays of constant values instead but this should be avoided wherever possible. -Arm RAL tests must exercise every path through the function that leads +ArmRAL tests must exercise every path through the function that leads to a successful exit. Setting the CMake variable `ARMRAL_ENABLE_COVERAGE=On` enables the compiler flags needed to visualize code coverage with [gcovr](https://gcovr.com/en/stable/).
@@ -356,8 +391,10 @@ In the top-level `CMakeLists.txt` add an `add_armral_test()` entry pointing to the source file for the tests. The source code for the test must be placed in a subdirectory of `/test`, where `` is the root directory of the library source. Usually the source for -all the tests of a single Arm RAL function is contained in a single -`main.cpp` file. +all the tests of a single ArmRAL function is contained in a single +`main.cpp` file. The tests should be added to `CMakeLists.txt` in +alphabetical order, grouped by directory, e.g. all UpperPHY +tests are grouped together in alphabetical order. Successful tests must return `EXIT_SUCCESS` from the `main()` function; failing tests must return `EXIT_FAILURE`.
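The following minimal sketch shows the overall shape such a test `main.cpp` can take; `scale_under_test`, `scale_reference` and `run_one_case` are invented stand-ins for this example, not real library entry points — in a real test the function under test would be an `armral_` function and the reference a separate reimplementation:

```cpp
// Minimal sketch of a test main.cpp: compare the function under test
// against a reference implementation over several problem sizes and
// return EXIT_SUCCESS or EXIT_FAILURE accordingly.
#include <cstdlib>
#include <vector>

// Reference reimplementation of the operation being tested.
static std::vector<float> scale_reference(const std::vector<float> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = 2.0F * in[i];
  }
  return out;
}

// Stand-in for the library function being tested.
static std::vector<float> scale_under_test(const std::vector<float> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = in[i] + in[i];
  }
  return out;
}

// Run one parameter set and report whether the results match.
static bool run_one_case(size_t num_samples) {
  std::vector<float> in(num_samples);
  for (size_t i = 0; i < num_samples; ++i) {
    in[i] = static_cast<float>(i) - 0.5F * static_cast<float>(num_samples);
  }
  return scale_under_test(in) == scale_reference(in);
}

int main() {
  bool passed = true;
  // Exercise a few sizes so that every successful path is covered.
  for (size_t num_samples : {1U, 7U, 64U, 1000U}) {
    passed = passed && run_one_case(num_samples);
  }
  return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}
```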
@@ -368,7 +405,7 @@ It is recommended to use [AddressSanitizer](https://clang.llvm.org/docs/AddressSanitizer.html) to test your patches for memory errors as patches will not be accepted unless this passes. Setting the CMake variable `ARMRAL_ENABLE_ASAN=On` -enables the flags needed to compile and link Arm RAL and its tests +enables the flags needed to compile and link ArmRAL and its tests with AddressSanitizer. The `make check` target will then run the tests using AddressSanitizer and will fail if an error is detected. @@ -415,7 +452,7 @@ The following code block provides a template for the `bench.py` script. ```py #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates import json from pathlib import Path @@ -450,20 +487,20 @@ Items in angle brackets `< >` are changed as appropriate according to the following descriptions. - ``: The name of the executable, e.g. -`bench_mu_law_compression_8bit` (see [Naming scheme](#naming-scheme)). + `bench_mu_law_compression_8bit` (see [Naming scheme](#naming-scheme)). - ``: The number of times the case should be run for -profiling (see [Number of repetitions](#number-of-repetitions)). + profiling (see [Number of repetitions](#number-of-repetitions)). - ``: The arguments that will be required in order -to run the function that is to be benchmarked. This can be a list of individual -elements, or can, for example, be a list of tuples if multiple arguments are -required for each case. The length of the list determines how many cases are -generated. See [Number of cases](#number-of-cases) for guidance on how many -cases there should be. + to run the function that is to be benchmarked. This can be a list of individual + elements, or can, for example, be a list of tuples if multiple arguments are + required for each case. The length of the list determines how many cases are + generated. See [Number of cases](#number-of-cases) for guidance on how many + cases there should be. - ``: A snake case string to identify the function being -benchmarked for a particular case, e.g. `mu_law_compression_8bit`. + benchmarked for a particular case, e.g. `mu_law_compression_8bit`. - ``: The arguments in the argument list. @@ -481,7 +518,7 @@ The following code block provides a basic template. ```cpp /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -535,40 +572,40 @@ The items in angle brackets `< >` are changed as appropriate according to the following descriptions. - ``: The name of the function that repeatedly calls the -function being benchmarked, e.g. `run_mu_law_compression_8bit_perf` (see -[Naming scheme](#naming-scheme)). + function being benchmarked, e.g. `run_mu_law_compression_8bit_perf` (see + [Naming scheme](#naming-scheme)). - ``, ``: The types of the arguments which are passed -in on the command line. + in on the command line. - ``: An uppercase string to identify the function, e.g. -`"MU LAW COMPRESSION 8BIT"`. + `"MU LAW COMPRESSION 8BIT"`. - ``, ``: Descriptions to identify the -arguments when printing. + arguments when printing. - ``, ``: The format specifiers for printing -the arguments. + the arguments. - ``, ``: The types of the variables defined locally -in ``. + in ``. - ``, ``: The names of variables defined locally in -``. + ``. 
- ``: The name of the library function being benchmarked (e.g. -`armral_mu_law_compr_8bit`). + `armral_mu_law_compr_8bit`). - ``: The number of arguments which are passed to the executable -on the command line. This is equal to the number of arguments in the `args` -field of the JSON object + 1 (since the filename is the first argument). + on the command line. This is equal to the number of arguments in the `args` + field of the JSON object + 1 (since the filename is the first argument). - ``, ``: The names of the arguments which are passed to the -executable on the command line. These are the names of the arguments provided -in the `args` field of the JSON object generated by `bench.py`. + executable on the command line. These are the names of the arguments provided + in the `args` field of the JSON object generated by `bench.py`. - ``, ``: A description of each -command line argument. + command line argument. ##### Outputs @@ -587,7 +624,9 @@ Once the new `main.cpp` file has been created, an entry must be added to where `` is the `exe_name` without `bench_` at the front (e.g. `mu_law_compression_8bit`). The entry goes with the other benchmark -entries as part of the `if(BUILD_TESTING)` logic. +entries as part of the `if(BUILD_TESTING)` logic. The benchmarks should be +added in alphabetical order to `CMakeLists.txt` grouped by the directories, +e.g. all UpperPHY benchmarks are grouped together in alphabetical order. #### Directory structure @@ -595,7 +634,7 @@ Benchmarks for different functions should be separated into different files. For example, for Mu Law compression and decompression there are different functions for 8-bit, 9-bit and 14-bit (de)compression. These should be in separate benchmarking executables. The Mu Law directory -structure in `bench` therefore looks like: +structure in `bench/DuRuInterface` therefore looks like: ``` +-- MuLaw diff --git a/CREDITS.md b/CREDITS.md index 467a1b1..0271d77 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -5,33 +5,31 @@ Acceleration Library: - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/6. + . - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support the addition and removal of filler bits when the soft buffer size is less than the full buffer size was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/5. + . - Work on `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support the addition and removal of filler bits when the code block size is not a multiple of lifting set size was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/4 + . - Work on `armral_seq_generator` to extend the `sequence_len` parameter to `uint32_t` was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/3 + . - Work on `armral_polar_rate_matching` and `armral_polar_rate_recovery` to enable or disable bit interleaving was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/2 + . - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support soft buffer sizes was contributed upstream by 4g5g Consultants. See - https://gitlab.arm.com/networking/ral/-/merge_requests/1 - - + . 
diff --git a/Doxyfile.in b/Doxyfile.in index c470dc9..f571d32 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.01" +PROJECT_NUMBER = "24.04" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/LICENSE.md b/LICENSE.md index 10ce6d4..e511299 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright 2020-2024 Arm Limited and/or its affiliates +SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index bcaea5e..6d90bff 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,25 @@ # Get started with Arm RAN Acceleration Library (ArmRAL) -Describes how to build, install, run tests and benchmarks, and uninstall Arm RAN -Acceleration Library (ArmRAL). +This document describes how to build, install, run tests and +benchmarks, and uninstall Arm RAN Acceleration Library (ArmRAL). -# Before you begin +## Introducing Arm RAN Acceleration Library -If you have not already downloaded Arm RAN Acceleration library, visit -https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download -to download the source code. +Arm RAN Acceleration Library provides optimized signal processing and related +maths functions for enabling 5G Radio Access Network (RAN) deployments. It +leverages the efficient vector units available on Arm cores that support the +Armv8-a architecture to accelerate 5G NR and LTE signal processing workloads, +including: + +* Matrix and vector arithmetic, such as matrix multiplication. +* Fast Fourier Transforms (FFTs). +* Digital modulation and demodulation. +* Cyclic Redundancy Check (CRC). +* Encoding and decoding schemes, including Polar, Low-Density Parity + Check (LDPC), and Turbo. +* Compression and decompression. + +## Before you begin * Ensure you have installed all the tools listed in the **Tools** section of the `RELEASE_NOTES.md` file. @@ -16,7 +28,7 @@ to download the source code. the PMULL extension, pmull is listed under the **Features** list given in the `/proc/cpuinfo` file. -# Build Arm RAN Acceleration Library (ArmRAL) +## Build Arm RAN Acceleration Library (ArmRAL) 1. Configure your environment. If you have multiple compilers installed on your machine, you can set the `CC` and `CXX` environment variables to the path to @@ -56,14 +68,14 @@ to download the source code. Notes: - * The `-DBUILD_TESTING=On` and `-DBUILD_EXAMPLES=On` options are optional, - but are required if you want to run the library tests (`-DBUILD_TESTING`) - and benchmarks (`-DBUILD_EXAMPLES`). + * The `-DBUILD_TESTING=On` and `-DBUILD_EXAMPLES=On` options are required + if you want to run the library tests (`-DBUILD_TESTING`) and benchmarks + (`-DBUILD_EXAMPLES`). - * The `-DCMAKE_INSTALL_PREFIX=` option is optional and - specifies the base directory used to install the library. The library - archive is installed to `/lib` and headers are installed to - `/include`. The default location is `/usr/local`. + * The `-DCMAKE_INSTALL_PREFIX=` option specifies the base + directory used to install the library. 
The library archive is installed to + `/lib` and headers are installed to `/include`. + The default location is `/usr/local`. * By default, a static library is built. To build a dynamic or a static library use the `-DBUILD_SHARED_LIBS={On|Off}` option. @@ -214,7 +226,7 @@ to download the source code. Default is `On`. -# Install Arm RAN Acceleration Library (ArmRAL) +## Install Arm RAN Acceleration Library (ArmRAL) After you have built Arm RAN Acceleration Library, you can install the library. @@ -235,7 +247,7 @@ After you have built Arm RAN Acceleration Library, you can install the library. directory. `install_manifest.txt` lists the installation locations for the library and the header files. -# Run the tests +## Run the tests The Arm RAN Acceleration Library package includes tests for the available functions in the library. @@ -247,9 +259,6 @@ To build and run the tests, use: make check -The tests run and test the available functions in the library. Testing -times vary from system to system, but typically only take a few seconds. - If you are not developing on an AArch64 machine, or if you want to test the SVE or SVE2 version of the library on an AArch64 machine that does not support the extension, you can use the `-DARMRAL_TEST_RUNNER` option to prefix each test @@ -260,7 +269,7 @@ prefix the tests with `qemu-aarch64` using: cmake .. -DBUILD_TESTING=On -DARMRAL_TEST_RUNNER=qemu-aarch64 make check -# Run the benchmarks +## Run the benchmarks All the functions in Arm RAN Acceleration Library contain benchmarking code that contains preset problem sizes. @@ -274,10 +283,21 @@ To build and run the benchmarks, use: make bench -Benchmark results print as JSON objects. To further process the results, you -can collect the results to a file or pipe the results into other scripts. +Benchmark results print as JSON objects. To further process the results, you can +collect the results to a file or pipe the results into other scripts. +Alternatively, the Makefile target: + + make bench_excel_summary + +will run the benchmarks and produce an Excel spreadsheet of the results, in +addition to printing them as JSON objects. To install the required Python +packages for this target, use: + + pip install -r /python/requirements.txt + +where `` is the path to the root directory of the library source. -# Run the examples +## Run the examples The source for the example programs is available in the `examples` directory, found in the ArmRAL root directory. @@ -301,7 +321,7 @@ More information about the examples that are available in Arm RAN Acceleration Library, and how to use the library in general, is available in **Use Arm RAN Acceleration Library (ArmRAL)** (see `examples.md`). -# Run the simulations +## Run the simulations You can evaluate the quality of the error correction of the different encoding schemes against the signal-to-noise ratio using a set of noisy channel simulation @@ -331,11 +351,11 @@ directory. More information about the simulation programs that are available in Arm RAN Acceleration Library is available in `simulation/README.md`. -# Code coverage +## Code coverage You can generate information that describes how much of the library is used by your application, or is covered by the included tests. To collect code coverage -information, you must have built Arm RAN Acceleration Library with +information, you must have built Arm RAN Acceleration Library with `-DARMRAL_ENABLE_COVERAGE=On`. An example workflow could be: @@ -361,11 +381,11 @@ update to a newer version of `gcovr`. 
To find out what versions of `gcovr` have been tested with ArmRAL, see the **Tools** section of the `RELEASE_NOTES.md` file. -# Documentation +## Documentation The Arm RAN Acceleration Library Reference Guide is available online at: - https://developer.arm.com/documentation/102249/2401 + If you have Doxygen installed on your system, you can build a local HTML version of the Arm RAN Acceleration Library documentation using CMake. @@ -377,7 +397,7 @@ To build the documentation, run: The HTML builds and is output to `docs/html/`. To view the documentation, open the `index.html` file in a browser. -# Uninstall Arm RAN Acceleration Library +## Uninstall Arm RAN Acceleration Library To uninstall Arm RAN Acceleration Library: diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index eee0c45..7830b73 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,4 +1,4 @@ -# Arm RAN Acceleration Library 24.01 Release Note +# Arm RAN Acceleration Library 24.04 Release Note Non-Confidential Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -7,7 +7,7 @@ Arm conventions and proprietary notices, including confidentiality status, terminology statement, and product release status, can be found at the end of this document. -# Contents +## Contents - Release overview - Release contents @@ -16,38 +16,32 @@ this document. - Conventions - Proprietary notices -# Release overview +## Release overview The following sections describe the product that this release note describes and its quality status at time of release. -Use of Arm RAN Acceleration Library is subject to a BSD-3-Clause license, the -text of which can be found in the `LICENSE.md` file in your product -installation. We will receive inbound contributions under the same license. - -## Product description +### Product description The Arm RAN Acceleration Library (ArmRAL) contains a set of functions for accelerating telecommunications applications such as, but not limited to, 5G -Radio Access Networks (RANs). - -The Arm RAN Acceleration Library 24.01 package provides a library that is -optimized for Arm AArch64-based processors. +Radio Access Networks (RANs). These functions are optimized for Arm AArch64-based +processors. Arm RAN Acceleration Library provides: - Vector functions - Matrix functions -- Lower PHY support functions -- Upper PHY support functions -- DU-RU Interface support functions +- Lower physical layer (Lower PHY) support functions +- Upper physical layer (Upper PHY) support functions +- Distributed Unit-Radio Unit (DU-RU) Interface support functions Arm RAN Acceleration Library includes functions that operate on 16-bit signed -integers and 32-bit floating-point values. +integers and 16-bit and 32-bit floating-point values. -## Release Status +### Release status -This is the 24.01 release of Arm RAN Acceleration Library. +This is the 24.04 release of Arm RAN Acceleration Library. These deliverables are being released under the terms of the agreement between Arm and each licensee (the "Agreement"). All planned verification and @@ -55,7 +49,7 @@ validation is complete. The release is suitable for volume production under the terms of the Agreement. -## Licensing information +### Licensing information Use of Arm RAN Acceleration Library is subject to a BSD-3-Clause license, the text of which can be found in the `LICENSE.md` file in your product @@ -64,56 +58,35 @@ installation. We will receive inbound contributions under the same license. 
If you require a different license than BSD-3-Clause for compatibility with your end product, please get in contact. -# Release contents +## Release contents Arm RAN Acceleration Library releases contain documentation and source files. The following subsections describe: -- Downloading and unpacking the product. +- Cloning the product's git repository from Arm's Gitlab. - The contents of this release. - Any changes since the previous release. - Any known issues and limitations that exist at the time of this release. -## Downloading and unpacking - -You can either clone the source as a git repository from Arm's Gitlab, -or you can download Arm RAN Acceleration Library as a tarball of -source from the Arm Developer website and then unpack the contents. - -**To clone the Arm RAN Acceleration Library repository via SSH:** - - git clone git@git.gitlab.arm.com:networking/ral.git - -**To clone the Arm RAN Acceleration Library repository via HTTPS:** - - git clone https://git.gitlab.arm.com/networking/ral.git +### Cloning the source repository -**To download the tarball and unpack the contents:** +**To obtain the 24.04 release of Arm RAN Acceleration Library by cloning + the repository via HTTPS:** -1. Go to https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download. + git clone -b armral-24.04 https://git.gitlab.arm.com/networking/ral -2. Complete the form and click **Submit**. The package downloads. - -3. Locate the downloaded .tar.gz file. - -4. Copy the .tar.gz file to the directory where these files are to be built. - -5. Extract the tar file contents using a tar utility: - - tar zxvf ral-armral-24.01.tar.gz - -## Deliverables +### Deliverables The downloaded product includes the deliverables listed in this section. -- Arm RAN Acceleration Library 24.01 +- Arm RAN Acceleration Library 24.04 - Release Notes (this document) - Documentation Product documentation is available on the Arm Developer website at: - https://developer.arm.com/documentation/102249/2401 + **Note:** Documentation, errata and release notes might change between product releases. For the latest documentation bundle, check the product download @@ -122,77 +95,67 @@ The downloaded product includes the deliverables listed in this section. **Note:** Arm tests its PDFs only in Adobe Acrobat and Acrobat Reader. Arm cannot guarantee the quality of this document when used with any other PDF reader. A suitable PDF reader can be downloaded from Adobe at - http://www.adobe.com. + . -## Differences from previous release +### Differences from previous release The following subsections describe differences from the previous release of Arm RAN Acceleration Library. -### Additions and functionality changes +#### Additions and functionality changes Describes new features or any technical changes to features or components in this release. -- Added support for the addition and removal of filler bits in - `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and - `armral_ldpc_rate_recovery` when the code block size is not a - multiple of lifting set size or when the soft buffer size is less - than the full buffer size. This process is described in the 3GPP - Technical Specification (TS) 38.212. This work was contributed - upstream by 4g5g Consultants. 
- - Extended `armral_cmplx_pseudo_inverse_direct_f32` and `armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the - regularized pseudo-inverse of a single complex 32-bit matrix of size - `M-by-N` for cases where `M > N` in addition to the cases where `M - <= N`. + regularized pseudo-inverse of a complex 32-bit matrix of size + `M-by-N` for the case where `M` and/or `N == 1`. + +- Added a Makefile target `bench_excel_summary` to run the benchmarks + and create an Excel spreadsheet containing the results. -### Performance improvements +#### Performance improvements Describes any features or components whose performance has improved in the current release compared with the previous release. -- Performance improvements for the following routines: - - * `armral_turbo_decode_block` and `armral_turbo_decode_block_noalloc`. - - Performance improvements for SVE2 implementations of the following routines: - * `armral_seq_generator`, for the cases when `sequence_len` is not a - multiple of 64. + - `armral_turbo_decode_block` and + `armral_turbo_decode_block_noalloc`. These functions now operate + internally on 16-bit floating point values rather than 32-bit + floating point values. + + - `armral_ldpc_encode_block` and + `armral_ldpc_encode_block_noalloc`. -### Changes to simulation programs +#### Changes to simulation programs Describes any changes, new features or components added to the channel simulation programs in this release. -- Added support for the addition and removal of filler bits in - `ldpc_awgn` when the code block size is not a multiple of lifting - set size. This work was contributed upstream by 4g5g Consultants. +- There are no changes to the channel simulation programs in this + release. -### Resolved issues +#### Resolved issues Describes any known issues resolved in the current release. -- LDPC block encoding (`armral_ldpc_encode_block`), rate matching - (`armral_ldpc_rate_matching`) and rate recovery - (`armral_ldpc_rate_recovery`) now support the insertion and removal - of filler bits as described in the 3GPP Technical Specification (TS) - 38.212. +- There are no known issues resolved in this release. -## Known limitations +### Known limitations Describes any known limitations of the current release. - There are no known limitations in this release. -# Support +## Support -If you have any issues with the installation, content or use of this release, -raise a question on the Developer Community Forum: +If you have any issues with the installation, content, or use of this +release, raise a question on the Developer Community Forum: - https://community.arm.com/developer/f/infrastructure-solutions + Arm will respond as soon as possible. @@ -203,37 +166,35 @@ A Full release of the Arm Deliverable shall have met the contractual requirement for verification and validation of the deliverable subject to any waivers agreed between Arm and the Customer. -## Tools +### Tools -The following points list the tools that are required to build or run Arm RAN -Acceleration Library: +To build or run Arm RAN Acceleration Library you will need: -* A recent version of a C/C++ compiler, such as GCC. Arm RAN - Acceleration Library has been tested with GCC 7.5.0, 8.5.0, 9.5.0, - 10.5.0, 11.4.0, 12.3.0, and 13.2.0. +- A C/C++ compiler, such as GCC. Arm RAN Acceleration Library has been tested + with GCC 7.5.0, 8.5.0, 9.5.0, 10.5.0, 11.4.0, 12.3.0, and 13.2.0. **Note:** If you are cross-compiling, you need a cross-toolchain compiler that targets AArch64. 
You can download open-source cross-toolchain builds of the GCC compiler on the Arm Developer website: - https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads + The variant to use for an AArch64 GNU/Linux target is `aarch64-none-linux-gnu`. -* A recent version of CMake (version 3.3.0, or higher). +- CMake version 3.3.0 or higher. -In addition to the preceding requirements: +Additionally: -* To run the benchmarks, you must have the Linux utility tool `perf` installed +- To run the benchmarks, you must have the Linux utility tool `perf` installed and a recent version of Python 3. Arm RAN Acceleration Library has been tested with Python 3.8.5. -* To build a local version of the documentation, you must have Doxygen +- To build a local version of the documentation, you must have Doxygen installed. Arm RAN Acceleration Library has been tested with Doxygen version 1.8.13. -* To generate code coverage HTML pages, you must have `gcovr` installed. The +- To generate code coverage HTML pages, you must have `gcovr` installed. The library has been tested with `gcovr` version 4.2. **Note:** Arm RAN Acceleration Library runs on AArch64 cores, however @@ -242,27 +203,27 @@ functions you must run on a core that supports the AArch64 PMULL extension. If your machine supports the PMULL extension, `pmull` is listed under the "Features" list given in the `/proc/cpuinfo` file. -# Release history +## Release history A full release history (with release notes) for Arm RAN Acceleration Library is available on the Arm Developer website: - https://developer.arm.com/downloads/-/arm-ran-acceleration-library/previous-releases-of-the-arm-ran-acceleration-library + -# Conventions +## Conventions The following subsections describe conventions used in Arm documents. -## Glossary +### Glossary The Arm Glossary is a list of terms that are used in Arm documentation, together with definitions for those terms. The Arm Glossary does not contain terms that are industry standard unless the Arm meaning differs from the generally accepted meaning. -See the Arm Glossary for more information: https://developer.arm.com/glossary. +See the Arm Glossary for more information: . -# Non-Confidential Proprietary Notice +## Non-Confidential Proprietary Notice This document is protected by copyright and other related rights and the practice or implementation of the information contained in this document may be @@ -310,7 +271,7 @@ The Arm corporate logo and words marked with ® or ™ are registered trademarks trademarks of Arm Limited (or its affiliates) in the US and/or elsewhere. All rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm’s trademark usage -guidelines at https://www.arm.com/company/policies/trademarks. +guidelines at . Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -318,7 +279,7 @@ Arm Limited. Company 02557590 registered in England. 110 Fulbourn Road, Cambridge, England CB1 9NJ. (LES-PRE-20349) -## Confidentiality Status +### Confidentiality Status This document is Non-Confidential. The right to use, copy and disclose this document may be subject to license restrictions in accordance with the terms of @@ -327,15 +288,15 @@ to. Unrestricted Access is an Arm internal classification. -## Product Status +### Product Status The information in this document is Final, that is for a developed product. 
-## Web Address +### Web Address -https://developer.arm.com + -## Inclusive language commitment +### Inclusive language commitment Arm values inclusive communities. Arm recognizes that we and our industry have used language that can be offensive. Arm strives to lead the industry and create diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py index 74414bb..1770d5a 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp index 32847b4..8375897 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py index aeaf28c..51d5ad7 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp index 0f27c83..64a2b4f 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py index 8710c18..2cb8056 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm 
Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp similarity index 93% rename from bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp index 8c66a87..8d0e972 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py similarity index 86% rename from bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py index eaf5b9b..426c494 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp similarity index 94% rename from bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp rename to bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp index 6536ea8..cd8a7a6 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Single/GeneralMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py similarity index 82% rename from bench/MatrixInv/Single/GeneralMatInv/bench.py rename to bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py index 3903a1b..369ee50 100755 --- a/bench/MatrixInv/Single/GeneralMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/GeneralMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp similarity index 90% rename from bench/MatrixInv/Single/GeneralMatInv/main.cpp rename to bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp index 94e63e0..2509ad5 100644 --- a/bench/MatrixInv/Single/GeneralMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git 
a/bench/MatrixInv/Single/HermitianMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py similarity index 82% rename from bench/MatrixInv/Single/HermitianMatInv/bench.py rename to bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py index f0d7e5b..af79e2e 100755 --- a/bench/MatrixInv/Single/HermitianMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/HermitianMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp similarity index 91% rename from bench/MatrixInv/Single/HermitianMatInv/main.cpp rename to bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp index c9b708f..de9d111 100644 --- a/bench/MatrixInv/Single/HermitianMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/1x2/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py index e7edca6..5fdd983 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp index 9b0453f..96216c6 100644 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/1x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py index d190973..51e0638 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp index 1b8bd8b..6c83b1a 100644 --- 
a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/2x2/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py index aee2667..4d8f7fb 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp index b554c1f..2fd1c77 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/2x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py index 9dd2030..6c0e4a8 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp index 7637055..24ab935 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py similarity index 84% rename from bench/MatrixMult/Batch/ArmSolve/4x4/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py index ee929f2..1a73cc6 100755 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git 
a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp index 4082649..2052289 100644 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py index 8549eab..ae47493 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp index 6d0006f..f29d40b 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py index 4ecaa28..726a768 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp index 85b3f96..19d7ced 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git 
a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py index 621a7a0..d42117e 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp index 63034a6..0c987fc 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py index 5e88789..d1dee61 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp index bdfbd19..1df1087 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py index 0cfde51..ca14a86 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates 
import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp similarity index 94% rename from bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp index 3344fe6..3ea7a1c 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py similarity index 86% rename from bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py index 78f0bae..f10ae62 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp similarity index 95% rename from bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp rename to bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp index 0aa6934..dee1b72 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py similarity index 82% rename from bench/MatrixMult/Single/MatrixMult16/32b/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py index 5e4312e..94584d5 100755 --- a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMult16/32b/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp index 3462e61..68baf3a 100644 --- a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py 
similarity index 82% rename from bench/MatrixMult/Single/MatrixMult16/64b/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py index ef4ad39..0be0f7a 100755 --- a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp similarity index 90% rename from bench/MatrixMult/Single/MatrixMult16/64b/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp index a1ebda8..098fc26 100644 --- a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py index 6125bad..1ee0755 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp index 890a174..d3ceec3 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py index f58fc63..3cb7d5d 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp similarity index 89% rename from bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp rename to 
bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp index 74ab2b7..060fe7e 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py index c7dd1f9..e79186f 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp index 3ccd0ce..abf84cd 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py similarity index 79% rename from bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py index 626b618..9c51504 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp similarity index 89% rename from bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp index d0eb869..a73a074 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/general/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py similarity index 87% rename from bench/MatrixMult/Single/MatrixMult32/general/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py index cae82ac..f0c6ae0 100755 --- 
a/bench/MatrixMult/Single/MatrixMult32/general/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp similarity index 93% rename from bench/MatrixMult/Single/MatrixMult32/general/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp index f37000d..a2f6657 100644 --- a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py similarity index 85% rename from bench/MatrixMult/Single/MatrixMultAAH32/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py index c911c26..1dca6b0 100755 --- a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp similarity index 91% rename from bench/MatrixMult/Single/MatrixMultAAH32/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 5ebdcf1..949c879 100644 --- a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py similarity index 91% rename from bench/MatrixMult/Single/MatrixMultAHB32/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py index 9a58a3d..71916e5 100755 --- a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixMultAHB32/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index 1fee878..873a3e3 100644 --- a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py similarity index 84% rename from bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py index c15b3ab..d2d7e58 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp index 63e49a2..3d21c23 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py similarity index 84% rename from bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py index 4ada38d..be7d58d 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp similarity index 92% rename from bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp index 8c0c7b1..5481fac 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py similarity index 87% rename from bench/MatrixMult/Single/MatrixVectorMult32/bench.py rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py index fb5e762..3a6ee40 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp similarity index 89% rename from bench/MatrixMult/Single/MatrixVectorMult32/main.cpp rename to bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp index 9cc90ba..07a22a3 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixPseudoInv/Direct/bench.py b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py similarity index 81% rename from bench/MatrixPseudoInv/Direct/bench.py rename to bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py index fcfb462..56c99b4 100755 --- a/bench/MatrixPseudoInv/Direct/bench.py +++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools @@ -20,7 +20,7 @@ j = { "cases": [] } -size1 = [2, 3, 4, 8, 16] +size1 = [1, 2, 3, 4, 8, 16] size2 = [32, 64, 128, 256] for (m, n) in itertools.chain(zip(size1, size2), zip(size2, size1)): diff --git a/bench/MatrixPseudoInv/Direct/main.cpp b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp similarity index 94% rename from bench/MatrixPseudoInv/Direct/main.cpp rename to bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp index 6339c24..1e55969 100644 --- a/bench/MatrixPseudoInv/Direct/main.cpp +++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py similarity index 82% rename from bench/VectorDotProd/VecDot16/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16/bench.py index 4c4bacd..a333391 100755 --- a/bench/VectorDotProd/VecDot16/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot16/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp index 4d2179f..d154206 100644 --- a/bench/VectorDotProd/VecDot16/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_2/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py index 
18d099c..e0c3df7 100755 --- a/bench/VectorDotProd/VecDot16_2/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp similarity index 92% rename from bench/VectorDotProd/VecDot16_2/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index 356bcfb..b1e24ea 100644 --- a/bench/VectorDotProd/VecDot16_2/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_2_32bit/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py index 3f1b23a..2be5f31 100755 --- a/bench/VectorDotProd/VecDot16_2_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp similarity index 92% rename from bench/VectorDotProd/VecDot16_2_32bit/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index 2e6377e..bc51b6a 100644 --- a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py similarity index 83% rename from bench/VectorDotProd/VecDot16_32bit/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py index 2dd7bdd..70c0455 100755 --- a/bench/VectorDotProd/VecDot16_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot16_32bit/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index 0a0f27d..618feeb 100644 --- a/bench/VectorDotProd/VecDot16_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot32/bench.py 
b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py similarity index 82% rename from bench/VectorDotProd/VecDot32/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot32/bench.py index 13764c4..37a8b26 100755 --- a/bench/VectorDotProd/VecDot32/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot32/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp similarity index 90% rename from bench/VectorDotProd/VecDot32/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp index 5ecf2c1..c2aee11 100644 --- a/bench/VectorDotProd/VecDot32/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot32_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py similarity index 82% rename from bench/VectorDotProd/VecDot32_2/bench.py rename to bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py index c249222..0a4b022 100755 --- a/bench/VectorDotProd/VecDot32_2/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot32_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp similarity index 75% rename from bench/VectorDotProd/VecDot32_2/main.cpp rename to bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp index 0365c30..a379d9e 100644 --- a/bench/VectorDotProd/VecDot32_2/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -14,12 +14,12 @@ void run_vec_dot_f32_2_perf(uint32_t num_samples, uint32_t num_reps) { printf("[VECDOT f32 2] - number of samples = %u, number of iterations = %u\n", num_samples, num_reps); - const std::vector<float> a_re(num_samples); - const std::vector<float> a_im(num_samples); - const std::vector<float> b_re(num_samples); - const std::vector<float> b_im(num_samples); - float c_re; - float c_im; + const std::vector<float32_t> a_re(num_samples); + const std::vector<float32_t> a_im(num_samples); + const std::vector<float32_t> b_re(num_samples); + const std::vector<float32_t> b_im(num_samples); + float32_t c_re; + float32_t c_im; const auto *a_re_ptr = a_re.data(); const auto *a_im_ptr = a_im.data(); diff --git a/bench/ElemWiseVectorMult/VecMul16/bench.py b/bench/BasicMathFun/VectorMult/VecMul16/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul16/bench.py rename to bench/BasicMathFun/VectorMult/VecMul16/bench.py index c8d40c3..e6f953e 100755 --- a/bench/ElemWiseVectorMult/VecMul16/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or
its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul16/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp similarity index 91% rename from bench/ElemWiseVectorMult/VecMul16/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul16/main.cpp index 3c40c29..e6999be 100644 --- a/bench/ElemWiseVectorMult/VecMul16/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul16_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul16_2/bench.py rename to bench/BasicMathFun/VectorMult/VecMul16_2/bench.py index ee9f806..99f3d83 100755 --- a/bench/ElemWiseVectorMult/VecMul16_2/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp similarity index 92% rename from bench/ElemWiseVectorMult/VecMul16_2/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp index 758eece..876dd37 100644 --- a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32/bench.py b/bench/BasicMathFun/VectorMult/VecMul32/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul32/bench.py rename to bench/BasicMathFun/VectorMult/VecMul32/bench.py index b8007ef..ac6de33 100755 --- a/bench/ElemWiseVectorMult/VecMul32/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp similarity index 91% rename from bench/ElemWiseVectorMult/VecMul32/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul32/main.cpp index 76c149a..07fec66 100644 --- a/bench/ElemWiseVectorMult/VecMul32/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py similarity index 82% rename from bench/ElemWiseVectorMult/VecMul32_2/bench.py rename to bench/BasicMathFun/VectorMult/VecMul32_2/bench.py index ea08bf9..c7936ea 100755 --- a/bench/ElemWiseVectorMult/VecMul32_2/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its 
affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp similarity index 74% rename from bench/ElemWiseVectorMult/VecMul32_2/main.cpp rename to bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp index ec1a210..b42d6ad 100644 --- a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -14,12 +14,12 @@ void run_vec_mul_f32_2_perf(uint32_t num_samples, uint32_t num_reps) { printf("[VECMUL f32_2] - number of samples = %u, number of iterations = %u\n", num_samples, num_reps); - const std::vector<float> a_re(num_samples); - const std::vector<float> a_im(num_samples); - const std::vector<float> b_re(num_samples); - const std::vector<float> b_im(num_samples); - std::vector<float> c_re(num_samples); - std::vector<float> c_im(num_samples); + const std::vector<float32_t> a_re(num_samples); + const std::vector<float32_t> a_im(num_samples); + const std::vector<float32_t> b_re(num_samples); + const std::vector<float32_t> b_im(num_samples); + std::vector<float32_t> c_re(num_samples); + std::vector<float32_t> c_im(num_samples); const auto *a_re_ptr = a_re.data(); const auto *a_im_ptr = a_im.data(); diff --git a/bench/MuLaw/Compression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py similarity index 87% rename from bench/MuLaw/Compression/14bit/bench.py rename to bench/DuRuInterface/MuLaw/Compression/14bit/bench.py index 2b9ab70..95720ea 100755 --- a/bench/MuLaw/Compression/14bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp similarity index 93% rename from bench/MuLaw/Compression/14bit/main.cpp rename to bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp index 73b7699..0cd606c 100644 --- a/bench/MuLaw/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py similarity index 87% rename from bench/MuLaw/Compression/8bit/bench.py rename to bench/DuRuInterface/MuLaw/Compression/8bit/bench.py index 43cefd8..f55e33b 100755 --- a/bench/MuLaw/Compression/8bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/8bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp similarity index 93% rename from bench/MuLaw/Compression/8bit/main.cpp rename to bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp index 2faa911..8a489e1 100644 ---
a/bench/MuLaw/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py similarity index 87% rename from bench/MuLaw/Compression/9bit/bench.py rename to bench/DuRuInterface/MuLaw/Compression/9bit/bench.py index cc24e67..82fc07f 100755 --- a/bench/MuLaw/Compression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/9bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp similarity index 93% rename from bench/MuLaw/Compression/9bit/main.cpp rename to bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp index a2c1118..f88240e 100644 --- a/bench/MuLaw/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/14bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py index 8f6d2b1..48cb1fd 100755 --- a/bench/MuLaw/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/14bit/main.cpp rename to bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp index a24bf21..d0b3498 100644 --- a/bench/MuLaw/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/8bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py index f70ecaf..0444d8a 100755 --- a/bench/MuLaw/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/8bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/8bit/main.cpp rename to 
bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp index c3a0f0a..2a50c5d 100644 --- a/bench/MuLaw/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py similarity index 87% rename from bench/MuLaw/Decompression/9bit/bench.py rename to bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py index 67512df..2bfe560 100755 --- a/bench/MuLaw/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/9bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp similarity index 93% rename from bench/MuLaw/Decompression/9bit/main.cpp rename to bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp index 2bcde05..c3b1b85 100644 --- a/bench/MuLaw/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/12bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py index 744bd01..54bce88 100755 --- a/bench/XRanBlockFloat/Compression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/12bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp index ec36a75..b4c34d7 100644 --- a/bench/XRanBlockFloat/Compression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/14bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py index 10f2e16..3b30a95 100755 --- a/bench/XRanBlockFloat/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp index eff8698..eb1cc6a 100644 --- a/bench/XRanBlockFloat/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py index 3e5f2f3..baac152 100755 --- a/bench/XRanBlockFloat/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp index 1aa7d2c..be88dc1 100644 --- a/bench/XRanBlockFloat/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Compression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py index 73391e2..2dfa15d 100755 --- a/bench/XRanBlockFloat/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Compression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp index 6a96d35..a253642 100644 --- a/bench/XRanBlockFloat/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/12bit/bench.py rename to 
bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py index f9ec6f8..1a0883a 100755 --- a/bench/XRanBlockFloat/Decompression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/12bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp index 9816ac1..fc9b8a8 100644 --- a/bench/XRanBlockFloat/Decompression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py index 1f08f50..6b38d69 100755 --- a/bench/XRanBlockFloat/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp index 52226a5..285eaa6 100644 --- a/bench/XRanBlockFloat/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py index f20eb2b..8f6f7e8 100755 --- a/bench/XRanBlockFloat/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp index 7734d12..8ba5be5 100644 --- a/bench/XRanBlockFloat/Decompression/8bit/main.cpp +++ 
b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py similarity index 87% rename from bench/XRanBlockFloat/Decompression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py index 5cf57a7..c19dff8 100755 --- a/bench/XRanBlockFloat/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp similarity index 93% rename from bench/XRanBlockFloat/Decompression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp index 1e868ff..8071995 100644 --- a/bench/XRanBlockFloat/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py index e2b2f15..efc7012 100755 --- a/bench/ORanBlockScaling/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp index 37f8da1..754710c 100644 --- a/bench/ORanBlockScaling/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py index 65d5537..78bc308 100755 --- a/bench/ORanBlockScaling/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited 
and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp index 43286ca..73958ae 100644 --- a/bench/ORanBlockScaling/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Compression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py index 54f9931..bfcc068 100755 --- a/bench/ORanBlockScaling/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Compression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp index 7d66f42..b3436d7 100644 --- a/bench/ORanBlockScaling/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Decompression/14bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py index cbb57d2..ac7429b 100755 --- a/bench/ORanBlockScaling/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/14bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp index a9448f9..6e07821 100644 --- a/bench/ORanBlockScaling/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py similarity index 87% rename from 
bench/ORanBlockScaling/Decompression/8bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py index 2807325..7d12222 100755 --- a/bench/ORanBlockScaling/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/8bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp index 60ffeec..f5bedca 100644 --- a/bench/ORanBlockScaling/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py similarity index 87% rename from bench/ORanBlockScaling/Decompression/9bit/bench.py rename to bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py index f16d82d..4a2ab15 100755 --- a/bench/ORanBlockScaling/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp similarity index 93% rename from bench/ORanBlockScaling/Decompression/9bit/main.cpp rename to bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp index 8cbab20..2c6aa9e 100644 --- a/bench/ORanBlockScaling/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Correlation/bench.py b/bench/LowerPHY/Correlation/bench.py similarity index 82% rename from bench/Correlation/bench.py rename to bench/LowerPHY/Correlation/bench.py index 9a36a39..e2a9be1 100755 --- a/bench/Correlation/bench.py +++ b/bench/LowerPHY/Correlation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Correlation/main.cpp b/bench/LowerPHY/Correlation/main.cpp similarity index 89% rename from bench/Correlation/main.cpp rename to bench/LowerPHY/Correlation/main.cpp index 068172f..7315a83 100644 --- a/bench/Correlation/main.cpp +++ b/bench/LowerPHY/Correlation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include 
"armral.h" diff --git a/bench/FFT/FFT16/bench.py b/bench/LowerPHY/FFT/FFT16/bench.py similarity index 87% rename from bench/FFT/FFT16/bench.py rename to bench/LowerPHY/FFT/FFT16/bench.py index f560e04..6e21900 100755 --- a/bench/FFT/FFT16/bench.py +++ b/bench/LowerPHY/FFT/FFT16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT16/main.cpp b/bench/LowerPHY/FFT/FFT16/main.cpp similarity index 92% rename from bench/FFT/FFT16/main.cpp rename to bench/LowerPHY/FFT/FFT16/main.cpp index 2ce24b0..1bf340e 100644 --- a/bench/FFT/FFT16/main.cpp +++ b/bench/LowerPHY/FFT/FFT16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FFT/FFT32/bench.py b/bench/LowerPHY/FFT/FFT32/bench.py similarity index 87% rename from bench/FFT/FFT32/bench.py rename to bench/LowerPHY/FFT/FFT32/bench.py index 83a3e73..e84f4fc 100755 --- a/bench/FFT/FFT32/bench.py +++ b/bench/LowerPHY/FFT/FFT32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT32/main.cpp b/bench/LowerPHY/FFT/FFT32/main.cpp similarity index 92% rename from bench/FFT/FFT32/main.cpp rename to bench/LowerPHY/FFT/FFT32/main.cpp index d20456b..1d469fd 100644 --- a/bench/FFT/FFT32/main.cpp +++ b/bench/LowerPHY/FFT/FFT32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16/bench.py b/bench/LowerPHY/FIR/FIR16/bench.py similarity index 83% rename from bench/FIR/FIR16/bench.py rename to bench/LowerPHY/FIR/FIR16/bench.py index f0b19e8..23a3626 100755 --- a/bench/FIR/FIR16/bench.py +++ b/bench/LowerPHY/FIR/FIR16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16/main.cpp b/bench/LowerPHY/FIR/FIR16/main.cpp similarity index 93% rename from bench/FIR/FIR16/main.cpp rename to bench/LowerPHY/FIR/FIR16/main.cpp index 58ee2c8..aae2b72 100644 --- a/bench/FIR/FIR16/main.cpp +++ b/bench/LowerPHY/FIR/FIR16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16Decimate2/bench.py b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py similarity index 84% rename from bench/FIR/FIR16Decimate2/bench.py rename to bench/LowerPHY/FIR/FIR16Decimate2/bench.py index 956ca7c..bd47c5d 100755 --- a/bench/FIR/FIR16Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or 
its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp similarity index 93% rename from bench/FIR/FIR16Decimate2/main.cpp rename to bench/LowerPHY/FIR/FIR16Decimate2/main.cpp index 8b8265a..f11ee5f 100644 --- a/bench/FIR/FIR16Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32/bench.py b/bench/LowerPHY/FIR/FIR32/bench.py similarity index 83% rename from bench/FIR/FIR32/bench.py rename to bench/LowerPHY/FIR/FIR32/bench.py index 86757b6..bb24247 100755 --- a/bench/FIR/FIR32/bench.py +++ b/bench/LowerPHY/FIR/FIR32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32/main.cpp b/bench/LowerPHY/FIR/FIR32/main.cpp similarity index 93% rename from bench/FIR/FIR32/main.cpp rename to bench/LowerPHY/FIR/FIR32/main.cpp index 02e3b08..b376ccd 100644 --- a/bench/FIR/FIR32/main.cpp +++ b/bench/LowerPHY/FIR/FIR32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32Decimate2/bench.py b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py similarity index 84% rename from bench/FIR/FIR32Decimate2/bench.py rename to bench/LowerPHY/FIR/FIR32Decimate2/bench.py index 41fc6c1..f70853a 100755 --- a/bench/FIR/FIR32Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp similarity index 93% rename from bench/FIR/FIR32Decimate2/main.cpp rename to bench/LowerPHY/FIR/FIR32Decimate2/main.cpp index b663f19..d8ac010 100644 --- a/bench/FIR/FIR32Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Scrambling/bench.py b/bench/LowerPHY/Scrambling/bench.py similarity index 84% rename from bench/Scrambling/bench.py rename to bench/LowerPHY/Scrambling/bench.py index ae4e285..ad7b7b2 100755 --- a/bench/Scrambling/bench.py +++ b/bench/LowerPHY/Scrambling/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Scrambling/main.cpp b/bench/LowerPHY/Scrambling/main.cpp similarity index 89% rename from bench/Scrambling/main.cpp rename to bench/LowerPHY/Scrambling/main.cpp index 6d85a8f..5e1985e 100644 --- a/bench/Scrambling/main.cpp +++ b/bench/LowerPHY/Scrambling/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN 
Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/SeqGenerator/bench.py b/bench/LowerPHY/SeqGenerator/bench.py similarity index 84% rename from bench/SeqGenerator/bench.py rename to bench/LowerPHY/SeqGenerator/bench.py index 7d8ae27..64db32d 100755 --- a/bench/SeqGenerator/bench.py +++ b/bench/LowerPHY/SeqGenerator/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/SeqGenerator/main.cpp b/bench/LowerPHY/SeqGenerator/main.cpp similarity index 86% rename from bench/SeqGenerator/main.cpp rename to bench/LowerPHY/SeqGenerator/main.cpp index 49baa2a..259e102 100644 --- a/bench/SeqGenerator/main.cpp +++ b/bench/LowerPHY/SeqGenerator/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/bench/SVD/bench.py b/bench/MatrixFactorizations/SVD/bench.py similarity index 86% rename from bench/SVD/bench.py rename to bench/MatrixFactorizations/SVD/bench.py index 22a8591..4cb05bd 100755 --- a/bench/SVD/bench.py +++ b/bench/MatrixFactorizations/SVD/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/SVD/main.cpp b/bench/MatrixFactorizations/SVD/main.cpp similarity index 89% rename from bench/SVD/main.cpp rename to bench/MatrixFactorizations/SVD/main.cpp index 86cba92..61e5444 100644 --- a/bench/SVD/main.cpp +++ b/bench/MatrixFactorizations/SVD/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -19,9 +19,9 @@ void bench_svd(bool gen_singular_vectors, int m, int n, int nreps) { std::vector<armral_cmplx_f32_t> a(size, {1.0F, 1.0F}); const int lda = n; for (int i = 0; i < n; ++i) { - a[i + lda * i] = armral_cmplx_f32_t{static_cast<float>(i + 2), 0}; + a[i + lda * i] = armral_cmplx_f32_t{static_cast<float32_t>(i + 2), 0}; } - std::vector<float> s(n); + std::vector<float32_t> s(n); // Left and right singular vectors.
std::vector u; diff --git a/bench/CRC/11/BigEndian/bench.py b/bench/UpperPHY/CRC/11/BigEndian/bench.py similarity index 85% rename from bench/CRC/11/BigEndian/bench.py rename to bench/UpperPHY/CRC/11/BigEndian/bench.py index 6c6f668..b2c2777 100755 --- a/bench/CRC/11/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/11/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/BigEndian/main.cpp b/bench/UpperPHY/CRC/11/BigEndian/main.cpp similarity index 91% rename from bench/CRC/11/BigEndian/main.cpp rename to bench/UpperPHY/CRC/11/BigEndian/main.cpp index d82dbd4..a75f3e8 100644 --- a/bench/CRC/11/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/11/LittleEndian/bench.py b/bench/UpperPHY/CRC/11/LittleEndian/bench.py similarity index 85% rename from bench/CRC/11/LittleEndian/bench.py rename to bench/UpperPHY/CRC/11/LittleEndian/bench.py index 350c7ea..bca79a9 100755 --- a/bench/CRC/11/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/11/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/LittleEndian/main.cpp b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/11/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/11/LittleEndian/main.cpp index 533b507..0e82518 100644 --- a/bench/CRC/11/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/BigEndian/bench.py b/bench/UpperPHY/CRC/16/BigEndian/bench.py similarity index 85% rename from bench/CRC/16/BigEndian/bench.py rename to bench/UpperPHY/CRC/16/BigEndian/bench.py index 8bf0fc0..738b08a 100755 --- a/bench/CRC/16/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/16/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/BigEndian/main.cpp b/bench/UpperPHY/CRC/16/BigEndian/main.cpp similarity index 91% rename from bench/CRC/16/BigEndian/main.cpp rename to bench/UpperPHY/CRC/16/BigEndian/main.cpp index a81ccf5..9265e41 100644 --- a/bench/CRC/16/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/LittleEndian/bench.py b/bench/UpperPHY/CRC/16/LittleEndian/bench.py similarity index 85% rename from bench/CRC/16/LittleEndian/bench.py rename to bench/UpperPHY/CRC/16/LittleEndian/bench.py index 
4c8ce83..5c6cc1f 100755 --- a/bench/CRC/16/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/16/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/LittleEndian/main.cpp b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/16/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/16/LittleEndian/main.cpp index ded10e8..d1cd343 100644 --- a/bench/CRC/16/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/BigEndian/bench.py b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/A/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/A/BigEndian/bench.py index a69cb7e..8052cae 100755 --- a/bench/CRC/24/A/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/A/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/A/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/A/BigEndian/main.cpp index ee1e1c7..33313dd 100644 --- a/bench/CRC/24/A/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/A/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/A/LittleEndian/bench.py index 576bafc..64c1ccc 100755 --- a/bench/CRC/24/A/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/A/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/A/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp index 17325f8..7c0e405 100644 --- a/bench/CRC/24/A/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/BigEndian/bench.py b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/B/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/B/BigEndian/bench.py index aa31855..7396685 100755 --- a/bench/CRC/24/B/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN 
Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/B/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/B/BigEndian/main.cpp index 876deaf..c557b47 100644 --- a/bench/CRC/24/B/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/B/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/B/LittleEndian/bench.py index cbd7e95..06bfea6 100755 --- a/bench/CRC/24/B/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/B/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp index b19eb35..b332e1b 100644 --- a/bench/CRC/24/B/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/BigEndian/bench.py b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py similarity index 85% rename from bench/CRC/24/C/BigEndian/bench.py rename to bench/UpperPHY/CRC/24/C/BigEndian/bench.py index 42303ee..1df67fd 100755 --- a/bench/CRC/24/C/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/C/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp similarity index 91% rename from bench/CRC/24/C/BigEndian/main.cpp rename to bench/UpperPHY/CRC/24/C/BigEndian/main.cpp index e1a18f2..f4d8553 100644 --- a/bench/CRC/24/C/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py similarity index 85% rename from bench/CRC/24/C/LittleEndian/bench.py rename to bench/UpperPHY/CRC/24/C/LittleEndian/bench.py index 331bb26..70471b5 100755 --- a/bench/CRC/24/C/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates 
import json from pathlib import Path diff --git a/bench/CRC/24/C/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/24/C/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp index d9c0a81..f3cfbc5 100644 --- a/bench/CRC/24/C/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/BigEndian/bench.py b/bench/UpperPHY/CRC/6/BigEndian/bench.py similarity index 85% rename from bench/CRC/6/BigEndian/bench.py rename to bench/UpperPHY/CRC/6/BigEndian/bench.py index bb64225..1bc3711 100755 --- a/bench/CRC/6/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/6/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/BigEndian/main.cpp b/bench/UpperPHY/CRC/6/BigEndian/main.cpp similarity index 91% rename from bench/CRC/6/BigEndian/main.cpp rename to bench/UpperPHY/CRC/6/BigEndian/main.cpp index b74b808..3ed97a4 100644 --- a/bench/CRC/6/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/6/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/LittleEndian/bench.py b/bench/UpperPHY/CRC/6/LittleEndian/bench.py similarity index 85% rename from bench/CRC/6/LittleEndian/bench.py rename to bench/UpperPHY/CRC/6/LittleEndian/bench.py index 7878f82..7cb6378 100755 --- a/bench/CRC/6/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/6/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/LittleEndian/main.cpp b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp similarity index 91% rename from bench/CRC/6/LittleEndian/main.cpp rename to bench/UpperPHY/CRC/6/LittleEndian/main.cpp index 8363eae..ab6958e 100644 --- a/bench/CRC/6/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Decoding/bench.py b/bench/UpperPHY/ConvolutionalDecoder/bench.py similarity index 85% rename from bench/ConvCoding/Decoding/bench.py rename to bench/UpperPHY/ConvolutionalDecoder/bench.py index 16ebdb0..f9c42b3 100755 --- a/bench/ConvCoding/Decoding/bench.py +++ b/bench/UpperPHY/ConvolutionalDecoder/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Decoding/main.cpp b/bench/UpperPHY/ConvolutionalDecoder/main.cpp similarity index 94% rename from bench/ConvCoding/Decoding/main.cpp rename to 
bench/UpperPHY/ConvolutionalDecoder/main.cpp index 8bc34dd..fbcfd53 100644 --- a/bench/ConvCoding/Decoding/main.cpp +++ b/bench/UpperPHY/ConvolutionalDecoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Encoding/bench.py b/bench/UpperPHY/ConvolutionalEncoder/bench.py similarity index 85% rename from bench/ConvCoding/Encoding/bench.py rename to bench/UpperPHY/ConvolutionalEncoder/bench.py index fca556c..7dc34b6 100755 --- a/bench/ConvCoding/Encoding/bench.py +++ b/bench/UpperPHY/ConvolutionalEncoder/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Encoding/main.cpp b/bench/UpperPHY/ConvolutionalEncoder/main.cpp similarity index 92% rename from bench/ConvCoding/Encoding/main.cpp rename to bench/UpperPHY/ConvolutionalEncoder/main.cpp index 8221726..65b9941 100644 --- a/bench/ConvCoding/Encoding/main.cpp +++ b/bench/UpperPHY/ConvolutionalEncoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Demodulation/bench.py b/bench/UpperPHY/Demodulation/bench.py similarity index 85% rename from bench/Demodulation/bench.py rename to bench/UpperPHY/Demodulation/bench.py index 051554d..1a099a6 100755 --- a/bench/Demodulation/bench.py +++ b/bench/UpperPHY/Demodulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Demodulation/main.cpp b/bench/UpperPHY/Demodulation/main.cpp similarity index 95% rename from bench/Demodulation/main.cpp rename to bench/UpperPHY/Demodulation/main.cpp index d6b9c92..e4e06fd 100644 --- a/bench/Demodulation/main.cpp +++ b/bench/UpperPHY/Demodulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/LDPC/Decoding/bench.py b/bench/UpperPHY/LDPC/Decoding/bench.py similarity index 92% rename from bench/LDPC/Decoding/bench.py rename to bench/UpperPHY/LDPC/Decoding/bench.py index 620ad12..0476cc7 100755 --- a/bench/LDPC/Decoding/bench.py +++ b/bench/UpperPHY/LDPC/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/LDPC/Decoding/main.cpp b/bench/UpperPHY/LDPC/Decoding/main.cpp similarity index 95% rename from bench/LDPC/Decoding/main.cpp rename to bench/UpperPHY/LDPC/Decoding/main.cpp index 9d26974..85acce3 100755 --- a/bench/LDPC/Decoding/main.cpp +++ b/bench/UpperPHY/LDPC/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates */ #include "armral.h" diff --git a/bench/LDPC/Encoding/bench.py b/bench/UpperPHY/LDPC/Encoding/bench.py similarity index 90% rename from bench/LDPC/Encoding/bench.py rename to bench/UpperPHY/LDPC/Encoding/bench.py index dd8f9d4..3a8e7fb 100755 --- a/bench/LDPC/Encoding/bench.py +++ b/bench/UpperPHY/LDPC/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/LDPC/Encoding/main.cpp b/bench/UpperPHY/LDPC/Encoding/main.cpp similarity index 95% rename from bench/LDPC/Encoding/main.cpp rename to bench/UpperPHY/LDPC/Encoding/main.cpp index d7c075a..cbc6cbd 100644 --- a/bench/LDPC/Encoding/main.cpp +++ b/bench/UpperPHY/LDPC/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/bench/LDPC/RateMatching/bench.py b/bench/UpperPHY/LDPC/RateMatching/bench.py similarity index 90% rename from bench/LDPC/RateMatching/bench.py rename to bench/UpperPHY/LDPC/RateMatching/bench.py index 5d752ec..cc49114 100755 --- a/bench/LDPC/RateMatching/bench.py +++ b/bench/UpperPHY/LDPC/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/LDPC/RateMatching/main.cpp b/bench/UpperPHY/LDPC/RateMatching/main.cpp similarity index 96% rename from bench/LDPC/RateMatching/main.cpp rename to bench/UpperPHY/LDPC/RateMatching/main.cpp index d99459a..5aa17c5 100644 --- a/bench/LDPC/RateMatching/main.cpp +++ b/bench/UpperPHY/LDPC/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/LDPC/RateRecovery/bench.py b/bench/UpperPHY/LDPC/RateRecovery/bench.py similarity index 90% rename from bench/LDPC/RateRecovery/bench.py rename to bench/UpperPHY/LDPC/RateRecovery/bench.py index 02463ca..8c00049 100755 --- a/bench/LDPC/RateRecovery/bench.py +++ b/bench/UpperPHY/LDPC/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/LDPC/RateRecovery/main.cpp b/bench/UpperPHY/LDPC/RateRecovery/main.cpp similarity index 95% rename from bench/LDPC/RateRecovery/main.cpp rename to bench/UpperPHY/LDPC/RateRecovery/main.cpp index af9e056..469a4bd 100644 --- a/bench/LDPC/RateRecovery/main.cpp +++ b/bench/UpperPHY/LDPC/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/Modulation/bench.py b/bench/UpperPHY/Modulation/bench.py similarity index 86% rename from bench/Modulation/bench.py rename to bench/UpperPHY/Modulation/bench.py index 
e6dcff6..9933b7b 100755 --- a/bench/Modulation/bench.py +++ b/bench/UpperPHY/Modulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Modulation/main.cpp b/bench/UpperPHY/Modulation/main.cpp similarity index 95% rename from bench/Modulation/main.cpp rename to bench/UpperPHY/Modulation/main.cpp index 5e0f7ba..bb777f0 100644 --- a/bench/Modulation/main.cpp +++ b/bench/UpperPHY/Modulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Decoding/bench.py b/bench/UpperPHY/Polar/Decoding/bench.py similarity index 88% rename from bench/Polar/Decoding/bench.py rename to bench/UpperPHY/Polar/Decoding/bench.py index b9b3ad6..5cddc12 100755 --- a/bench/Polar/Decoding/bench.py +++ b/bench/UpperPHY/Polar/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Decoding/main.cpp b/bench/UpperPHY/Polar/Decoding/main.cpp similarity index 89% rename from bench/Polar/Decoding/main.cpp rename to bench/UpperPHY/Polar/Decoding/main.cpp index 6da1928..31e89fb 100644 --- a/bench/Polar/Decoding/main.cpp +++ b/bench/UpperPHY/Polar/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "rng.hpp" @@ -30,8 +30,8 @@ void run_polar_decoding_perf(uint32_t n, uint32_t k, uint32_t l, // microarchitectural branch prediction are too optimistic and give an // unrealistically fast result. We use a linear congruential generator to // avoid calling rand() or C++ random number generators, which are both slow. 
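The comment in the polar-decoding benchmark above explains why the input buffer is filled using a linear congruential generator (LCG) rather than `rand()` or the C++ `<random>` engines: the data must be cheap to produce but varied enough that branch prediction does not give unrealistically fast timings. A minimal Python sketch of the same idea; the class name and constants are illustrative only, and the library's own generator lives in `armral::utils` and may use different parameters:

    # Hypothetical LCG, shown only to illustrate the benchmark-seeding idea.
    class LCG:
        def __init__(self, seed=42):
            self.state = seed

        def one(self):
            # Standard 32-bit LCG constants (Numerical Recipes); not ArmRAL's.
            self.state = (1664525 * self.state + 1013904223) & 0xFFFFFFFF
            return self.state & 0xFF  # one pseudo-random byte

    lcg = LCG()
    data = bytes(lcg.one() for _ in range(16))
    print(data.hex())  # cheap, non-constant input for the decoder under test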
- linear_congruential_generator lcg; - auto state = random_state::from_seeds({42}); + armral::utils::linear_congruential_generator lcg; + auto state = armral::utils::random_state::from_seeds({42}); for (uint32_t i = 0; i < n; ++i) { ((uint8_t *)a.data())[i] = lcg.one(&state); } diff --git a/bench/Polar/Encoding/bench.py b/bench/UpperPHY/Polar/Encoding/bench.py similarity index 84% rename from bench/Polar/Encoding/bench.py rename to bench/UpperPHY/Polar/Encoding/bench.py index d01db8b..d05b5db 100755 --- a/bench/Polar/Encoding/bench.py +++ b/bench/UpperPHY/Polar/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Encoding/main.cpp b/bench/UpperPHY/Polar/Encoding/main.cpp similarity index 89% rename from bench/Polar/Encoding/main.cpp rename to bench/UpperPHY/Polar/Encoding/main.cpp index 86bd403..a1cab8b 100644 --- a/bench/Polar/Encoding/main.cpp +++ b/bench/UpperPHY/Polar/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Frozen/bench.py b/bench/UpperPHY/Polar/Frozen/bench.py similarity index 93% rename from bench/Polar/Frozen/bench.py rename to bench/UpperPHY/Polar/Frozen/bench.py index c25d3c8..50648a2 100755 --- a/bench/Polar/Frozen/bench.py +++ b/bench/UpperPHY/Polar/Frozen/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Frozen/main.cpp b/bench/UpperPHY/Polar/Frozen/main.cpp similarity index 93% rename from bench/Polar/Frozen/main.cpp rename to bench/UpperPHY/Polar/Frozen/main.cpp index 8db346a..5ba5e35 100644 --- a/bench/Polar/Frozen/main.cpp +++ b/bench/UpperPHY/Polar/Frozen/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateMatching/bench.py b/bench/UpperPHY/Polar/RateMatching/bench.py similarity index 91% rename from bench/Polar/RateMatching/bench.py rename to bench/UpperPHY/Polar/RateMatching/bench.py index fa5715f..92c0535 100755 --- a/bench/Polar/RateMatching/bench.py +++ b/bench/UpperPHY/Polar/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateMatching/main.cpp b/bench/UpperPHY/Polar/RateMatching/main.cpp similarity index 94% rename from bench/Polar/RateMatching/main.cpp rename to bench/UpperPHY/Polar/RateMatching/main.cpp index af6a831..a5bf08a 100644 --- a/bench/Polar/RateMatching/main.cpp +++ b/bench/UpperPHY/Polar/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateRecovery/bench.py 
b/bench/UpperPHY/Polar/RateRecovery/bench.py similarity index 91% rename from bench/Polar/RateRecovery/bench.py rename to bench/UpperPHY/Polar/RateRecovery/bench.py index 4687b6d..a2a2c3f 100755 --- a/bench/Polar/RateRecovery/bench.py +++ b/bench/UpperPHY/Polar/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateRecovery/main.cpp b/bench/UpperPHY/Polar/RateRecovery/main.cpp similarity index 94% rename from bench/Polar/RateRecovery/main.cpp rename to bench/UpperPHY/Polar/RateRecovery/main.cpp index b687110..019b4a5 100644 --- a/bench/Polar/RateRecovery/main.cpp +++ b/bench/UpperPHY/Polar/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelDeinterleave/bench.py b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py similarity index 88% rename from bench/Polar/SubchannelDeinterleave/bench.py rename to bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py index d804d3b..29fd5bc 100755 --- a/bench/Polar/SubchannelDeinterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelDeinterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp similarity index 93% rename from bench/Polar/SubchannelDeinterleave/main.cpp rename to bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index 54e9108..e5bb27d 100644 --- a/bench/Polar/SubchannelDeinterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelInterleave/bench.py b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py similarity index 88% rename from bench/Polar/SubchannelInterleave/bench.py rename to bench/UpperPHY/Polar/SubchannelInterleave/bench.py index 8620391..de89975 100755 --- a/bench/Polar/SubchannelInterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelInterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp similarity index 93% rename from bench/Polar/SubchannelInterleave/main.cpp rename to bench/UpperPHY/Polar/SubchannelInterleave/main.cpp index c2623be..01d3db4 100644 --- a/bench/Polar/SubchannelInterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/Decoding/bench.py 
b/bench/UpperPHY/Turbo/Decoding/bench.py similarity index 84% rename from bench/Turbo/Decoding/bench.py rename to bench/UpperPHY/Turbo/Decoding/bench.py index ebd3e38..11c546a 100755 --- a/bench/Turbo/Decoding/bench.py +++ b/bench/UpperPHY/Turbo/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Decoding/main.cpp b/bench/UpperPHY/Turbo/Decoding/main.cpp similarity index 94% rename from bench/Turbo/Decoding/main.cpp rename to bench/UpperPHY/Turbo/Decoding/main.cpp index b0e21bb..1362f23 100644 --- a/bench/Turbo/Decoding/main.cpp +++ b/bench/UpperPHY/Turbo/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" @@ -42,13 +42,13 @@ void run_turbo_decoding_perf(const uint32_t num_prbs, const uint32_t num_bits, buffer_bump_allocator allocator{buffer.data()}; armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), - itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.0, + itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, num_iters, allocator); #else heap_allocator allocator{}; armral::turbo::decode_block( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), - itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.0, + itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, num_iters, allocator); #endif } diff --git a/bench/Turbo/Encoding/bench.py b/bench/UpperPHY/Turbo/Encoding/bench.py similarity index 84% rename from bench/Turbo/Encoding/bench.py rename to bench/UpperPHY/Turbo/Encoding/bench.py index 5c1db10..a50972f 100755 --- a/bench/Turbo/Encoding/bench.py +++ b/bench/UpperPHY/Turbo/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Encoding/main.cpp b/bench/UpperPHY/Turbo/Encoding/main.cpp similarity index 95% rename from bench/Turbo/Encoding/main.cpp rename to bench/UpperPHY/Turbo/Encoding/main.cpp index b79df85..a4d3979 100644 --- a/bench/Turbo/Encoding/main.cpp +++ b/bench/UpperPHY/Turbo/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateMatching/bench.py b/bench/UpperPHY/Turbo/RateMatching/bench.py similarity index 86% rename from bench/Turbo/RateMatching/bench.py rename to bench/UpperPHY/Turbo/RateMatching/bench.py index a36a1ea..9ba9ee1 100755 --- a/bench/Turbo/RateMatching/bench.py +++ b/bench/UpperPHY/Turbo/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateMatching/main.cpp b/bench/UpperPHY/Turbo/RateMatching/main.cpp similarity index 95% rename from bench/Turbo/RateMatching/main.cpp rename to 
bench/UpperPHY/Turbo/RateMatching/main.cpp index 809bf14..3148fa2 100644 --- a/bench/Turbo/RateMatching/main.cpp +++ b/bench/UpperPHY/Turbo/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateRecovery/bench.py b/bench/UpperPHY/Turbo/RateRecovery/bench.py similarity index 86% rename from bench/Turbo/RateRecovery/bench.py rename to bench/UpperPHY/Turbo/RateRecovery/bench.py index 2cc54c2..3e74ded 100755 --- a/bench/Turbo/RateRecovery/bench.py +++ b/bench/UpperPHY/Turbo/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateRecovery/main.cpp b/bench/UpperPHY/Turbo/RateRecovery/main.cpp similarity index 95% rename from bench/Turbo/RateRecovery/main.cpp rename to bench/UpperPHY/Turbo/RateRecovery/main.cpp index 61d0e78..38795a3 100644 --- a/bench/Turbo/RateRecovery/main.cpp +++ b/bench/UpperPHY/Turbo/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/benchmarker.py b/bench/benchmarker.py index d3c2d6a..5a42cd3 100755 --- a/bench/benchmarker.py +++ b/bench/benchmarker.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # This program is for benchmarking the performance of armral functions. @@ -168,7 +168,7 @@ def ignore_sigint(): def run_benchmarks_concurrent(cases): num_procs = multiprocessing.cpu_count() with multiprocessing.Pool(processes=num_procs, initializer=ignore_sigint, maxtasksperchild=1) as pool: - # serialise display_result rather than using imap to avoid racing prints. + # serialize display_result rather than using imap to avoid racing prints. return max(map(display_result, pool.imap(run_case, cases))) diff --git a/bench/benchmarker_utils.py b/bench/benchmarker_utils.py index c369eb2..370395d 100755 --- a/bench/benchmarker_utils.py +++ b/bench/benchmarker_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import collections import os import subprocess @@ -12,13 +12,15 @@ NETFUL_EXPECTED_TIMEOUT_RETCODE = 3 NETFUL_ALLOW_ERROR_RETCODE = 4 -def shell(cmd, check=True): +def shell(cmd, check=True, **kwargs): """ Run cmd on the command line and return stdout. Throws an exception - if the return code is non-zero. + if the return code is non-zero. Remaining kwargs are passed on to + subprocess.run(). 
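The updated `shell()` docstring above notes that remaining keyword arguments are now forwarded to `subprocess.run()`. A standalone sketch of the pattern (not the library's exact helper, which returns a `ShellResult`), showing how callers can pass options such as `cwd`, `env`, or `timeout` through without changing the wrapper itself:

    import subprocess

    def shell(cmd, check=True, **kwargs):
        # Run cmd, capture output, and forward any extra kwargs to subprocess.run().
        result = subprocess.run(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, check=check, **kwargs)
        return (result.returncode,
                result.stdout.decode("utf-8"),
                result.stderr.decode("utf-8"))

    # Example: run in a specific directory with a 30-second timeout.
    rc, out, err = shell(["ls", "-l"], cwd="/tmp", timeout=30)
    print(rc, out)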
""" result = subprocess.run(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, check=check) + stderr=subprocess.PIPE, check=check, + **kwargs) return ShellResult(result.returncode, result.stdout.decode("utf-8"), result.stderr.decode("utf-8")) diff --git a/bench/default_runner.py b/bench/default_runner.py index e5cb3ca..ee02254 100755 --- a/bench/default_runner.py +++ b/bench/default_runner.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates import json import argparse diff --git a/docs/doxywrapper/arm_infra_html.css b/docs/doxywrapper/arm_infra_html.css index f21550b..1a83e1d 100644 --- a/docs/doxywrapper/arm_infra_html.css +++ b/docs/doxywrapper/arm_infra_html.css @@ -1333,7 +1333,7 @@ tr.heading h2 { } #powerTip div.ttdoc { - color: grey; + color: gray; font-style: italic; } diff --git a/docs/examples.md b/docs/examples.md index ebfeff5..b53a8fb 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -3,7 +3,7 @@ This topic describes how to compile and link your application code to Arm RAN Acceleration Library (ArmRAL). -# Before you begin +## Before you begin * Ensure you have a recent version of a C/C++ compiler, such as GCC. See the Release Notes for a full list of supported GCC versions. @@ -17,9 +17,9 @@ Acceleration Library (ArmRAL). To build the library, use: - tar zxvf ral-armral-24.01.tar.gz - mkdir ral-armral-24.01/build - cd ral-armral-24.01/build + git clone -b armral-24.04 https://git.gitlab.arm.com/networking/ral.git + mkdir ral/build + cd ral/build cmake .. make -j @@ -28,7 +28,7 @@ Acceleration Library (ArmRAL). #include "armral.h" -# Procedure +## Procedure 1. Build and link your program with Arm RAN Acceleration Library. For GCC, use: @@ -49,7 +49,7 @@ Acceleration Library (ArmRAL). ./ -# Example: Run 'fft_cf32_example.c' +## Example: Run 'fft_cf32_example.c' In this example, we use Arm RAN Acceleration Library to compute and solve a simple Fast Fourier Transform (FFT) problem. @@ -103,8 +103,7 @@ The following source file can be found in the ArmRAL source directory under (-3.312299 + 1.687701i) (-5.940955 + -0.940955i) - -# Other examples: block-float, modulation, and polar examples +## Other examples: block-float, modulation, and polar examples Arm RAN Acceleration Library also includes block-float, modulation, and polar examples. These example files can also be found in the `/examples/` directory. @@ -149,9 +148,9 @@ included: The example binary takes three arguments, in the following order: - 1. The polar code size (`N`) - 2. The rate-matched codeword length (`E`) - 3. The number of information bits (`K`) + 1. The polar code size (`N`) + 2. The rate-matched codeword length (`E`) + 3. The number of information bits (`K`) For example, to run a compiled binary of the `polar_example.cpp`, called, `polar_example`, with an input array of `N = 128`, `E = 100`, and `K = 35`, @@ -162,6 +161,6 @@ included: Each example can be run according to the **Procedure** described above, as demonstrated in the **Example: Run 'fft_cf32_example.c'** section. -# Related information +## Related information For more information, see the **README** file. diff --git a/docs/frontmatter.md b/docs/frontmatter.md index c5fd9bf..fe55907 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -2,17 +2,27 @@ Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. 
-# About this book +## About this book This book contains reference documentation for Arm RAN Acceleration Library (ArmRAL). The book was generated from the source code using Doxygen. -Arm RAN Acceleration Library contains a set of functions for accelerating -telecommunications applications such as, but not limited to, 5G Radio Access -Networks (RANs). +Arm RAN Acceleration Library provides optimized signal processing and related +maths functions for enabling 5G Radio Access Network (RAN) deployments. It +leverages the efficient vector units available on Arm cores that support the +Armv8-a architecture to accelerate 5G NR and LTE signal processing workloads, +including: + +* Matrix and vector arithmetic, such as matrix multiplication. +* Fast Fourier Transforms (FFTs). +* Digital modulation and demodulation. +* Cyclic Redundancy Check (CRC). +* Encoding and decoding schemes, including Polar, Low-Density Parity + Check (LDPC), and Turbo. +* Compression and decompression. You can download Arm RAN Acceleration Library for free from -https://developer.arm.com/solutions/infrastructure/developer-resources/5g/ran/download. +. Arm RAN Acceleration Library is built as a static library, and must be linked in to any executable that needs to use the library. The source code can be @@ -22,9 +32,9 @@ in the `src` directory, testing code is located in the `test` directory, benchmarking code is located in the `bench` directory, and examples are located in the `examples` directory. -# Feedback +## Feedback -## Feedback on this product +### Feedback on this product If you have any comments or suggestions about this product, contact your supplier and give: @@ -34,18 +44,18 @@ supplier and give: * An explanation with as much information as you can provide. Include symptoms and diagnostic procedures if appropriate. -## Feedback on content +### Feedback on content If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2401_00_en. +* The number 102249_2404_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. Arm also welcomes general suggestions for additions and improvements. -# Non-Confidential Proprietary Notice +## Non-Confidential Proprietary Notice This document is protected by copyright and other related rights and the practice or implementation of the information contained in this document may be @@ -93,7 +103,7 @@ The Arm corporate logo and words marked with ® or ™ are registered trademarks trademarks of Arm Limited (or its affiliates) in the US and/or elsewhere. All rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm's trademark usage -guidelines at https://www.arm.com/company/policies/trademarks. +guidelines at . Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. @@ -103,7 +113,7 @@ Arm Limited. Company 02557590 registered in England. (LES-PRE-20349) -# Confidentiality Status +## Confidentiality Status This document is Non-Confidential. The right to use, copy and disclose this document may be subject to license restrictions in accordance with the terms of @@ -112,15 +122,15 @@ to. Unrestricted Access is an Arm internal classification. -# Product Status +## Product Status The information in this document is Final, that is for a developed product. 
-# Web Address +## Web Address -https://developer.arm.com + -# Progressive terminology commitment +## Progressive terminology commitment Arm values inclusive communities. Arm recognizes that we and our industry have used language that can be offensive. Arm strives to lead the industry and create @@ -129,9 +139,9 @@ change. We believe that this document contains no offensive terms. If you find offensive terms in this document, please contact terms@arm.com. -# Release Information +## Release Information -## Document History +### Document History Issue | Date | Confidentiality | Change --------|-----------------|------------------|----------------------------------------------------- @@ -149,3 +159,4 @@ Issue | Date | Confidentiality | Change 2307-00 | 07 July 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.07 2310-00 | 06 October 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.10 2401-00 | 19 January 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.01 +2404-00 | 19 April 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.04 diff --git a/examples/block_float_9b_example.c b/examples/block_float_9b_example.c index 8abe390..1a48eb9 100644 --- a/examples/block_float_9b_example.c +++ b/examples/block_float_9b_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/fft_cf32_example.c b/examples/fft_cf32_example.c index 690d876..4fc4762 100644 --- a/examples/fft_cf32_example.c +++ b/examples/fft_cf32_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -17,7 +17,7 @@ static void example_fft_plan_and_execute(int n) { armral_fft_create_plan_cf32(&p, n, -1); // Create the data that is to be used in FFTs. The input array (x) needs to - // be initialised. The output array (y) does not. + // be initialized. The output array (y) does not. armral_cmplx_f32_t *x = (armral_cmplx_f32_t *)malloc(n * sizeof(armral_cmplx_f32_t)); armral_cmplx_f32_t *y = diff --git a/examples/modulation_example.c b/examples/modulation_example.c index 3ee95d6..94538ac 100644 --- a/examples/modulation_example.c +++ b/examples/modulation_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/polar_example.cpp b/examples/polar_example.cpp index 5648f30..d2b9f81 100644 --- a/examples/polar_example.cpp +++ b/examples/polar_example.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/include/armral.h b/include/armral.h index 2f247eb..60592f0 100644 --- a/include/armral.h +++ b/include/armral.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -193,8 +193,8 @@ typedef struct { * \brief 32-bit floating-point complex data type. */ typedef struct { - float re; ///< 32-bit real component. 
- float im; ///< 32-bit imaginary component. + float32_t re; ///< 32-bit real component. + float32_t im; ///< 32-bit imaginary component. } armral_cmplx_f32_t; /** @@ -418,10 +418,11 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, const armral_cmplx_f32_t *a, * @param[out] c_im Points to the imaginary part of the output result. * @return An `armral_status` value that indicates success or failure. */ -armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *a_re, - const float *a_im, const float *b_re, - const float *b_im, float *c_re, - float *c_im); +armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float32_t *a_re, + const float32_t *a_im, + const float32_t *b_re, + const float32_t *b_im, float32_t *c_re, + float32_t *c_im); /** @} end of cmplx_by_cmplx_mult group */ @@ -489,11 +490,12 @@ armral_status armral_cmplx_vecdot_f32(int32_t n, * @param[out] p_src_c_im Points to the imaginary part of the output result. * @return An `armral_status` value that indicates success or failure. */ -armral_status armral_cmplx_vecdot_f32_2(int32_t n, const float *p_src_a_re, - const float *p_src_a_im, - const float *p_src_b_re, - const float *p_src_b_im, - float *p_src_c_re, float *p_src_c_im); +armral_status armral_cmplx_vecdot_f32_2(int32_t n, const float32_t *p_src_a_re, + const float32_t *p_src_a_im, + const float32_t *p_src_b_re, + const float32_t *p_src_b_im, + float32_t *p_src_c_re, + float32_t *p_src_c_im); /** * This algorithm computes the dot product between a pair of arrays of complex @@ -851,7 +853,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_i16_32bit_pa( /** * This algorithm performs the multiplication `A x` for matrix `A` and vector * `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. * * @param[in] m The number of rows in matrix `A` and the length of @@ -872,7 +874,7 @@ armral_status armral_cmplx_mat_vec_mult_f32(uint16_t m, uint16_t n, * This algorithm performs matrix-vector multiplication for a batch of * `M-by-N` matrices and length `N` input vectors. Each multiplication is of the * form `A x` for a matrix `A` and vector `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. * * The matrix elements are interleaved such that all elements for a particular @@ -915,7 +917,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( * `M-by-N` matrices and length `N` input vectors, utilizing a "pointer array" * storage layout for the input and output matrix batches. Each multiplication * is of the form `A x` for a matrix `A` and vector `x`, and assumes that: - * + Matrix and vector elements are complex float values. + * + Matrix and vector elements are complex 32-bit float values. * + Matrices are stored in memory in row-major order. * * The `p_srcs_a` parameter is an array of pointers of length `M * N`. The @@ -1651,9 +1653,9 @@ armral_cmplx_mat_inverse_batch_f32_pa(uint32_t num_mats, uint32_t size, * in memory, in row-major order. * * \note - * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * - If `m <= n` the number of rows `m` in the input matrix must be 1, 2, 3, 4, * 8 or 16. - * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * - If `m > n` the number of columns `n` in the input matrix must be 1, 2, 3, * 4, 8 or 16. 
* * @param[in] m The number of rows in input matrix `A`. @@ -1698,9 +1700,9 @@ armral_cmplx_pseudo_inverse_direct_f32(uint16_t m, uint16_t n, float32_t lambda, * in memory, in row-major order. * * \note - * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * - If `m <= n` the number of rows `m` in the input matrix must be 1, 2, 3, 4, * 8 or 16. - * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * - If `m > n` the number of columns `n` in the input matrix must be 1, 2, 3, * 4, 8 or 16. * * This function takes a pre-allocated buffer (`buffer`) to use internally. @@ -4329,7 +4331,7 @@ uint32_t armral_tail_biting_convolutional_decode_block_noalloc_buffer_size( * @return An `armral_status` value that indicates success or failure. */ armral_status armral_svd_cf32(bool vect, int m, int n, armral_cmplx_f32_t *a, - float *s, armral_cmplx_f32_t *u, + float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt); /** @@ -4387,7 +4389,7 @@ armral_status armral_svd_cf32(bool vect, int m, int n, armral_cmplx_f32_t *a, * @return An `armral_status` value that indicates success or failure. */ armral_status armral_svd_cf32_noalloc(bool vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, void *buffer); diff --git a/python/benchmark_excel_summary.py b/python/benchmark_excel_summary.py new file mode 100755 index 0000000..8dd3fe8 --- /dev/null +++ b/python/benchmark_excel_summary.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + +import argparse +import json +import pandas as pd +import re +import sys + + +class UnpackedName: + """ + A helper class for giving a nice sort order over benchmark names. + This essentially just splits the string on underscores and converts what it + can to integers for comparison, such that + e.g. 
decompression_9b_2 < decompression_9b_10 + """ + + def __init__(self, name): + self.name = name + self.arr = re.findall(r"[^_.]+", name) + for i, elem in enumerate(self.arr): + nums = list(map(int, re.findall(r"[0-9]+", elem))) + self.arr[i] = (nums, elem) + + def __lt__(self, other): + """ + Lexographic comparison, with type-mismatches handled arbitrarily + """ + for x, y in zip(self.arr, other.arr): + assert type(x) is type(y) + if x < y: + return True + if x > y: + return False + return len(self.arr) < len(other.arr) + + def __eq__(self, other): + return self.name == other.name + + def __hash__(self): + return self.name.__hash__() + + def __len__(self): + return len(self.name) + + def __str__(self): + return self.name + + +def format_headers_set_widths(worksheet, workbook, df): + header_format = workbook.add_format({ + 'bold': True, + 'align': 'center', + 'border': 1}) + + # Write headers + for col_num, header in enumerate(df.columns.values): + worksheet.write(0, col_num, header, header_format) + + # Set width for benchmark name column + worksheet.set_column(0, 0, max(len(x) for x in df.name)) + + # Set width for data columns + max_width = max(len(x) for x in df.columns.values) + worksheet.set_column(1, 1, max_width + 2) + + +def write_worksheet(writer, workbook, df): + name = "results" + + # Write the table to an Excel worksheet, leaving room to add headers + df.to_excel( + writer, sheet_name=name, startrow=1, + float_format="%.2f", header=False, index=False) + + # Write headers and set column widths + worksheet = writer.sheets[name] + format_headers_set_widths(worksheet, workbook, df) + + +def sort_table(df): + # Sort by the benchmark names + df = df.set_index(df.name) + new_ind = sorted(df.index.values, key=lambda x: UnpackedName(x)) + return df.reindex(new_ind) + + +def get_json_results(src): + json_data = [] + with open(src, 'rb') as f: + for line in f: + json_data.append(json.loads(line)) + + # Create dataframe of results + df = pd.json_normalize(json_data) + + # Drop columns that won't be used + drop_cols = df.columns.difference(['name', 'median_cycles']) + return df.drop(drop_cols, axis=1) + + +def write_workbook(src, dst): + # Get dataframes for data and sort by benchmark name + df_results = get_json_results(src) + df_results = sort_table(df_results) + + # Create a workbook and add worksheets + writer = pd.ExcelWriter(dst, engine="xlsxwriter") + workbook = writer.book + write_worksheet(writer, workbook, df_results) + + # Write the file + writer.close() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output", metavar="path", + default="results.xlsx", + help="specify output workbook path") + parser.add_argument("json_file", help="JSON benchmark results file") + args = parser.parse_args() + + print("Writing Excel workbook to {}".format(args.output)) + write_workbook(args.json_file, args.output) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..6907a90 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,2 @@ +pandas +xlsxwriter diff --git a/simulation/CMakeLists.txt b/simulation/CMakeLists.txt index cf7aa62..d0c5438 100644 --- a/simulation/CMakeLists.txt +++ b/simulation/CMakeLists.txt @@ -10,67 +10,118 @@ add_custom_target(simulation) # Interface for common simulation includes add_library(simulation_common INTERFACE) -target_include_directories(simulation_common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) 
+target_include_directories(simulation_common + INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) function(set_omp_cxx_flags) - if (NOT OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") - return() - endif() - check_c_compiler_flag(-fopenmp OPENMP_FLAG_IS_VALID) - if (OPENMP_FLAG_IS_VALID) - set(OpenMP_CXX_FLAGS "-fopenmp" PARENT_SCOPE) - else() - check_c_compiler_flag(-fopenmp=libomp OPENMP_FLAG_IS_VALID) - if (OPENMP_FLAG_IS_VALID) - set(OpenMP_CXX_FLAGS "-fopenmp=libomp" PARENT_SCOPE) - endif() + if(NOT OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") + return() + endif() + check_c_compiler_flag(-fopenmp OPENMP_FLAG_IS_VALID) + if(OPENMP_FLAG_IS_VALID) + set(OpenMP_CXX_FLAGS + "-fopenmp" + PARENT_SCOPE) + else() + check_c_compiler_flag(-fopenmp=libomp OPENMP_FLAG_IS_VALID) + if(OPENMP_FLAG_IS_VALID) + set(OpenMP_CXX_FLAGS + "-fopenmp=libomp" + PARENT_SCOPE) endif() + endif() endfunction() find_package(OpenMP) find_package(Threads) -if (Threads_FOUND) - # This is not the modern way of adding support for OpenMP - # but rather a fix for known issues when using CMake < 3.4 - if(NOT TARGET OpenMP::OpenMP_CXX) - add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) - - set_omp_cxx_flags() - if (OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") - # Sometimes we are failing to find OpenMP in testing. Needs more investigation, but in - # the meantime, just don't build the project - add_custom_target(ldpc_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(polar_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(turbo_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(convolutional_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - add_custom_target(modulation_awgn COMMAND cmake -E echo "OpenMP not found. Not building simulations") - return() - endif() - - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS}) - # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise. - set_property(TARGET OpenMP::OpenMP_CXX - PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads) +if(Threads_FOUND) + # This is not the modern way of adding support for OpenMP but rather a fix for + # known issues when using CMake < 3.4 + if(NOT TARGET OpenMP::OpenMP_CXX) + add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) + set_omp_cxx_flags() + if(OpenMP_CXX_FLAGS STREQUAL "NOTFOUND") + # Sometimes we are failing to find OpenMP in testing. Needs more + # investigation, but in the meantime, just don't build the project + add_custom_target( + ldpc_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + polar_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + turbo_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + convolutional_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + add_custom_target( + modulation_awgn COMMAND cmake -E echo + "OpenMP not found. Not building simulations") + return() endif() - # Actual simulation code - add_subdirectory(ldpc_awgn) - add_subdirectory(polar_awgn) - add_subdirectory(turbo_awgn) - add_subdirectory(convolutional_awgn) - add_subdirectory(modulation_awgn) -else() + set_property(TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_COMPILE_OPTIONS + ${OpenMP_CXX_FLAGS}) + # Only works if the same flag is passed to the linker; use CMake 3.9+ + # otherwise. 
+ set_property( + TARGET OpenMP::OpenMP_CXX PROPERTY INTERFACE_LINK_LIBRARIES + ${OpenMP_CXX_FLAGS} Threads::Threads) + endif() + + # Actual simulation code + function(add_armral_sim SIM_NAME SIM_CMD_LINE_OPTS) + set(SIM_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/${SIM_NAME}/${SIM_NAME}.cpp) + + set(SIM_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) + set(SIM_COMPILER_FLAGS + $<$:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17 + ${OpenMP_CXX_FLAGS}>) + + add_executable(${SIM_NAME} ${SIM_SOURCE}) + target_link_libraries(${SIM_NAME} PUBLIC simulation_common armral + armral_awgn armral_utils) + target_link_libraries(${SIM_NAME} PRIVATE OpenMP::OpenMP_CXX) + target_compile_options( + ${SIM_NAME} PRIVATE ${SIM_COMPILE_OPTIONS} ${SIM_COMPILER_FLAGS} + "$<$:-Og>") - # If no Threads is found we simply do not create the ldpc_awgn targets. - # This is useful when building on bare-metal where the concept of threads - # does not exist but we still want to build the library. - MESSAGE(STATUS "Threads not found, skip ldpc_awgn target.") + add_dependencies(simulation ${SIM_NAME}) + if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) + # Add test for the simulation executable. At present this just checks that + # the executable can be successfully invoked with a set of valid inputs. + # We do not check the validity of the output. We also only run this if we + # are not using a test running wrapper. + add_test(NAME ${SIM_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} + ${SIM_CMD_LINE_OPTS}) + set_tests_properties(${SIM_NAME} PROPERTIES TIMEOUT 3000) + add_dependencies(check ${SIM_NAME}) + endif() + + endfunction() + + add_armral_sim(convolutional_awgn "-k;8;-m;0;-u;128") + add_armral_sim(ldpc_awgn "-z;3;-b;1;-m;0;-r;0;-u;128") + add_armral_sim(modulation_awgn "-k;32;-m;0;-u;128") + add_armral_sim(polar_awgn "-k;32;-e;32;-l;1;-m;0;-i;0;-u;128") + add_armral_sim(turbo_awgn "-k;40;-m;0;-i;1;-e;60") + +else() + # If no Threads is found we simply do not create the simulation targets. This + # is useful when building on bare-metal where the concept of threads does not + # exist but we still want to build the library. + message(STATUS "Threads not found, skip simulation target.") endif() if(BUILD_TESTING) - # Build simulation as part of the main "check" target - add_dependencies(check simulation) + # Build simulation as part of the main "check" target + add_dependencies(check simulation) endif() diff --git a/simulation/README.md b/simulation/README.md index 77829a4..8453b2a 100644 --- a/simulation/README.md +++ b/simulation/README.md @@ -1,7 +1,6 @@ -Get started with ArmRAL noisy channel simulation -================================================ +# Get started with ArmRAL noisy channel simulation -# Introduction +## Introduction This directory contains utilities and programs that you can use to evaluate the error-correction performance of the coding schemes provided in Arm RAN @@ -16,9 +15,9 @@ find a mathematical description of the AWGN which is simulated. The definition of what is meant by bit and block error rates is then given, and we conclude with instructions for how to use the utilities contained in this folder. -# Additive White Gaussian Noise (AWGN) Simulation +## Additive White Gaussian Noise (AWGN) Simulation -## Using simulated noise +### Using simulated noise Noisy channels are simulated by adding noise to the symbols generated by the modulation routine. This simulates that a signal is sent over a noisy network. @@ -60,7 +59,7 @@ for spectral efficiency `rho`. 
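The SNR used by the simulators follows from Eb/N0 and the spectral efficiency `rho` in the usual way: `SNR = (Eb/N0) * rho`, with `rho` the information bit rate divided by the bandwidth. A small sketch of that conversion, using the same quantities that the `ebn0_to_snr` helper takes later in this patch; the values are illustrative only:

    from math import log10

    def ebn0_db_to_snr_db(ebn0_db, coding_rate, bits_per_symbol, symbol_rate, bandwidth):
        # Spectral efficiency rho = information bit rate / bandwidth.
        rho = coding_rate * bits_per_symbol * symbol_rate / bandwidth
        # SNR = (Eb/N0) * rho, or in decibels:
        return ebn0_db + 10 * log10(rho)

    # QPSK (2 bits/symbol), rate-1/3 code, symbol rate equal to the bandwidth.
    print(ebn0_db_to_snr_db(4.0, 1.0 / 3.0, 2, 1e6, 1e6))  # ~2.24 dB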
To calculate the spectral efficiency, the modulation scheme and bandwidth of the channel must be known, and passed to the simulation program. -## Further assumptions +### Further assumptions The simulation programs follow the description of coding and modulation schemes provided in 3GPP Technical Specification (TS) 36.12, Section 5.1.3 (for Turbo @@ -75,7 +74,7 @@ and Polar coding). We make the following further assumptions: 3. No Cyclic Redundancy Check (CRC) is performed. -## Bit and block error rates +### Bit and block error rates The simulator computes the error rates in terms of bits or blocks by comparing the input bits of encoding and the output decoded bits. The input bits are @@ -93,7 +92,7 @@ block with at least one incorrectly decoded bit. bler = nbl / number_of_blocks -# Get started with simulation programs +## Get started with simulation programs **Note:** To compile and execute the simulation programs, you must have built ArmRAL with the `-DBUILD_SIMULATION=On` CMake option. @@ -107,8 +106,9 @@ The following assumes that you are running commands from the build directory. The built binaries can be found in the `simulation` subdirectory of the build directory. -In the following, the coding scheme `` must be one of `polar`, `turbo`, -`ldpc`, or `modulation` for simulations without using a coding scheme. +In the following, `` can be one of the supported coding schemes +(`convolutional`, `ldpc`, `polar` or `turbo`). Set `` to `modulation` for +simulation without a coding scheme. * To build the AWGN channel simulation for a given coding scheme ``, use: @@ -116,12 +116,12 @@ In the following, the coding scheme `` must be one of `polar`, `turbo`, * To run the AWGN channel simulation for `` with arguments ``, use: - ./simulation/_awgn/_awgn + ./simulation/_awgn * To get a list of possible input arguments and associated documentation, use the same command without arguments: - ./simulation/_awgn/_awgn + ./simulation/_awgn * Executing a simulation will write JSON output to stdout. The output contains information on the observed bit and block error rates for the input @@ -129,7 +129,7 @@ In the following, the coding scheme `` must be one of `polar`, `turbo`, use of the Python scripts described in the section on drawing performance charts. -# Modulation schemes +## Modulation schemes All simulators use modulation and demodulation, respectively, before and after adding noise to the channel. @@ -153,7 +153,7 @@ the range of the generated log-likelihood ratios (LLRs). A default value for find that the best performance of decoding relies on a good choice of ``, and you are encouraged to provide a value for this parameter. -# Simulation program for modulation +## Simulation program for modulation The program `modulation_awgn` simulates the transmission of data without performing any forward error correction. Data is modulated, then has @@ -179,7 +179,7 @@ The JSON record contains the following fields: "ber": } -# Simulation programs for individual coding schemes +## Simulation programs for individual coding schemes In this section, we give the definition of some parameters used in the programs associated with each coding scheme. @@ -189,12 +189,13 @@ help text use --help -where `` is one of `polar_awgn`, `turbo_awgn`, or `ldpc_awgn`. The -help text of the programs gives more detailed descriptions on the parameters -than you will find in the sections below. The information below helps you to -run the simulation programs and understand their output. 
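The bit and block error rates defined above reduce to two counts over the decoded blocks; a minimal sketch, assuming every block carries the same number of bits:

    def error_rates(sent_blocks, decoded_blocks):
        # Each element is one block, given as a list of bits.
        bit_errors = sum(tx != rx
                         for sent, decoded in zip(sent_blocks, decoded_blocks)
                         for tx, rx in zip(sent, decoded))
        block_errors = sum(sent != decoded
                           for sent, decoded in zip(sent_blocks, decoded_blocks))
        bits_per_block = len(sent_blocks[0])
        ber = bit_errors / (len(sent_blocks) * bits_per_block)
        bler = block_errors / len(sent_blocks)
        return ber, bler

    print(error_rates([[0, 1, 1, 0], [1, 1, 0, 0]],
                      [[0, 1, 1, 0], [1, 0, 0, 0]]))  # (0.125, 0.5)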
+where `` is one of `polar_awgn`, `turbo_awgn`, `ldpc_awgn`, or +`convolutional_awgn`. The help text of the programs gives more detailed +descriptions on the parameters than you will find in the sections below. The +information below helps you to run the simulation programs and understand their +output. -## Polar Codes +### Polar Codes You can run the `polar` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -219,7 +220,7 @@ The JSON record contains the following fields: "ber": } -## Turbo Codes +### Turbo Codes You can run the `turbo` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -241,7 +242,7 @@ The JSON record contains the following fields: "ber": } -## Low-Density Parity-Check (LDPC) Codes +### Low-Density Parity-Check (LDPC) Codes You can run the `LDPC` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -265,7 +266,7 @@ The JSON record contains the following fields: "ber": } -## Tail-biting Convolutional Codes +### Tail-biting Convolutional Codes You can run the `convolutional` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: @@ -286,7 +287,7 @@ The JSON record contains the following fields: "ber": } -# Drawing performance charts +## Drawing performance charts The simulator allows users to evaluate the performance of a coding scheme. In the context of noisy channels, performance is evaluated in terms of output @@ -318,7 +319,7 @@ scripts requires a recent version of Python. ArmRAL has been tested with Python ./simulation/_awgn/_error_rate.py --help -# Drawing capacity charts +## Drawing capacity charts The simulator allows users to draw the data rates of each modulation and compare them to the capacity of the AWGN channel (the Shannon limit). diff --git a/simulation/awgn/CMakeLists.txt b/simulation/awgn/CMakeLists.txt index be9a7ca..b5f5522 100644 --- a/simulation/awgn/CMakeLists.txt +++ b/simulation/awgn/CMakeLists.txt @@ -1,19 +1,15 @@ cmake_minimum_required(VERSION 3.3) project(awgn VERSION 0.0) -set(AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/awgn.cpp -) +set(AWGN_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/awgn.cpp) set(AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17>) +set(AWGN_COMPILER_FLAGS $<$:-Wshadow -Wall -Wcast-qual + -fno-rtti -fno-exceptions -std=c++17>) add_library(armral_awgn ${AWGN_SOURCES}) target_include_directories(armral_awgn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(armral_awgn PUBLIC armral armral_utils) -target_compile_options(armral_awgn PRIVATE - ${AWGN_COMPILE_OPTIONS} - ${AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) +target_compile_options( + armral_awgn PRIVATE ${AWGN_COMPILE_OPTIONS} ${AWGN_COMPILER_FLAGS} + "$<$:-Og>") diff --git a/simulation/awgn/awgn.cpp b/simulation/awgn/awgn.cpp index 6ae035c..cfd7652 100644 --- a/simulation/awgn/awgn.cpp +++ b/simulation/awgn/awgn.cpp @@ -1,18 +1,20 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ -#include "awgn.h" +#include "awgn.hpp" #include "rng.hpp" #include #include +namespace armral::simulation { + /* - * Return a random doubleing-point number in the range [-1, 1). + * Return a random double floating-point number in the range [-1, 1). 
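The capacity charts compare the measured data rates against the Shannon limit of the AWGN channel, `C = B * log2(1 + SNR)`. The sketch below is just that formula, not the interface of `capacity.py`:

    from math import log2

    def awgn_capacity_bps(bandwidth_hz, snr_db):
        # Shannon limit for an AWGN channel: C = B * log2(1 + SNR).
        snr = 10 ** (snr_db / 10)
        return bandwidth_hz * log2(1 + snr)

    # A 1 MHz channel at 10 dB SNR supports at most ~3.46 Mbit/s.
    print(awgn_capacity_bps(1e6, 10.0))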
*/ -static double sample_double_unit(random_state *state) { - linear_congruential_generator lcg; +static double sample_double_unit(armral::utils::random_state *state) { + armral::utils::linear_congruential_generator lcg; return lcg.one(state) * 2.0 - 1.0; } @@ -20,7 +22,7 @@ static double sample_double_unit(random_state *state) { * Return a number taken from a normal distribution with mean=0 and * the specified stddev. */ -static double sample_normal(random_state *state, double sigma) { +static double sample_normal(armral::utils::random_state *state, double sigma) { double u; double r; do { @@ -38,8 +40,9 @@ static double sample_normal(random_state *state, double sigma) { * decibels, and deduce the random field required to produce noise of the * appropriate power (mean square amplitude). This noise is added to the signal. */ -void add_awgn(random_state *state, int num_mod_symbols, double snr_db, - armral_fixed_point_index frac_bits, armral_cmplx_int16_t *xs) { +void add_awgn(armral::utils::random_state *state, int num_mod_symbols, + double snr_db, armral_fixed_point_index frac_bits, + armral_cmplx_int16_t *xs) { // snr_db = 10 * log_10(s / r) // => r = 10^(-snr/10) @@ -92,3 +95,5 @@ double ebn0_to_snr(double coding_rate, int bits_per_symb, double symb_rate, return snr_db; } + +} // namespace armral::simulation diff --git a/simulation/awgn/awgn.h b/simulation/awgn/awgn.hpp similarity index 82% rename from simulation/awgn/awgn.h rename to simulation/awgn/awgn.hpp index ad8c8fe..ba978e8 100644 --- a/simulation/awgn/awgn.h +++ b/simulation/awgn/awgn.hpp @@ -1,12 +1,14 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include "armral.h" #include "rng.hpp" +namespace armral::simulation { + /* * Add noise to a channel where it is assumed that the power (mean square * amplitude) of the signal is 1. We target a specific signal to noise ratio in @@ -21,8 +23,9 @@ * param [in,out] xs On input, the signal to add noise to. On * output the signal, disturbed by random noise. 
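`add_awgn` assumes unit signal power and derives the required noise power directly from the target SNR, `r = 10^(-snr_db/10)`. A floating-point sketch of the same idea; splitting the noise power evenly between the I and Q components is an assumption made here, and the Q2.13 fixed-point handling of the real routine is omitted:

    import random
    from math import sqrt

    def add_awgn(symbols, snr_db):
        # Signal power is taken to be 1, so the required noise power is
        # r = 10^(-snr_db / 10), shared between the real and imaginary parts.
        noise_power = 10 ** (-snr_db / 10)
        sigma = sqrt(noise_power / 2)
        return [s + complex(random.gauss(0, sigma), random.gauss(0, sigma))
                for s in symbols]

    print(add_awgn([1 + 0j, 0 + 1j, -1 + 0j], snr_db=10.0))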
*/ -void add_awgn(random_state *state, int num_mod_symbols, double snr_db, - armral_fixed_point_index frac_bits, armral_cmplx_int16_t *xs); +void add_awgn(armral::utils::random_state *state, int num_mod_symbols, + double snr_db, armral_fixed_point_index frac_bits, + armral_cmplx_int16_t *xs); /* * Compute the SNR in db given the coding rate, the bits per symbol, the @@ -37,3 +40,5 @@ void add_awgn(random_state *state, int num_mod_symbols, double snr_db, */ double ebn0_to_snr(double coding_rate, int bits_per_symb, double symb_rate, double bw, double ebn0_db); + +} // namespace armral::simulation diff --git a/simulation/capacity/capacity.py b/simulation/capacity/capacity.py index 4b331ad..496028f 100755 --- a/simulation/capacity/capacity.py +++ b/simulation/capacity/capacity.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from math import sqrt, exp, pi, log diff --git a/simulation/convolutional_awgn/CMakeLists.txt b/simulation/convolutional_awgn/CMakeLists.txt deleted file mode 100644 index 14faf79..0000000 --- a/simulation/convolutional_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(convolutional_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(CONV_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/convolutional_awgn.cpp -) - -set(CONV_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(CONV_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(convolutional_awgn ${CONV_AWGN_SOURCES}) -target_link_libraries(convolutional_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(convolutional_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(convolutional_awgn PRIVATE - ${CONV_AWGN_COMPILE_OPTIONS} - ${CONV_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation convolutional_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. - add_test(NAME convolutional_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/convolutional_awgn -k 8 -m 0 -u 128) - set_tests_properties(convolutional_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check convolutional_awgn) -endif() diff --git a/simulation/convolutional_awgn/convolutional_awgn.cpp b/simulation/convolutional_awgn/convolutional_awgn.cpp index 294b5b3..6c74c89 100644 --- a/simulation/convolutional_awgn/convolutional_awgn.cpp +++ b/simulation/convolutional_awgn/convolutional_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -28,7 +28,7 @@ void usage(const char *exe_name) { << "The arguments required by " << exe_name << " are:\n\n" << " Number of bits in the encoded message.\n" << " Type of modulation. 
Supported values are:\n" - << armral_simulation::print_valid_mod_type(2) + << armral::simulation::print_valid_mod_type(2) << " Scaling parameter used in demodulation when\n" << " using fixed-point Q2.13 representation for symbols.\n" << " is an integer.\n" @@ -64,7 +64,7 @@ struct convolutional_example_data { convolutional_example_data(uint32_t k, armral_modulation_type mod) { mod_type = mod; - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); len_in = k; len_encoded = k; len_out = k; @@ -108,13 +108,13 @@ struct convolutional_example_data { // Perform an end-to-end encoding, modulation, transmission, demodulation, and // decoding and count the number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, uint32_t iter_max, convolutional_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). Not sure if // that is an issue with randomly generated data, but we are paying @@ -137,12 +137,12 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data2_encoded, data->data2_mod); // AWGN channel effects - add some noise to all the encoded bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data0_mod); - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data1_mod); - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data2_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data0_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data1_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data2_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -175,7 +175,7 @@ struct sim_result { sim_result(uint32_t k_in, armral_modulation_type mod, uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages, uint32_t iter_max_in) - : k(k_in), mod_type(armral_simulation::mod_to_str(mod)), ulp(ulp_in), + : k(k_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * k)), iter_max(iter_max_in) { } @@ -204,7 +204,7 @@ struct sim_result { bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, uint16_t ulp, double ebn0_db) { // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding ratio (k/n) of the LTE convolutional codes // is 1/3, see 3GPP TS 36.212 double coding_rate = 1.0 / 3.0; @@ -214,8 +214,8 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. 
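The data-initialization loop above packs one random bit per position, with bit `i` landing in byte `i / 8` and the first wire-order bit in the most significant position. The same packing in isolation:

    def pack_bits_msb_first(bits):
        # Bit i goes into byte i // 8; the first bit in wire order is the MSB.
        packed = bytearray((len(bits) + 7) // 8)
        for i, bit in enumerate(bits):
            packed[i // 8] |= (bit & 1) << (7 - (i % 8))
        return bytes(packed)

    print(pack_bits_msb_first([1, 0, 1, 1, 0, 0, 0, 1, 1]).hex())  # 'b180'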
double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -228,7 +228,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, convolutional_example_data data(k, mod_type); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, iter_max, &data); nb += num_bit_errors; @@ -253,7 +253,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; uint16_t ulp = 0; uint32_t iter_max = 0; @@ -292,10 +292,10 @@ int main(int argc, char **argv) { << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/convolutional_awgn/convolutional_error_rate.py b/simulation/convolutional_awgn/convolutional_error_rate.py index 1bd7e71..0a887e6 100755 --- a/simulation/convolutional_awgn/convolutional_error_rate.py +++ b/simulation/convolutional_awgn/convolutional_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/include/simulation_common.hpp b/simulation/include/simulation_common.hpp index ecced32..3a36b98 100644 --- a/simulation/include/simulation_common.hpp +++ b/simulation/include/simulation_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -10,7 +10,7 @@ #define SNEW(T, sz) ((T *)calloc(sz, sizeof(T))) -namespace armral_simulation { +namespace armral::simulation { uint8_t bits_per_symbol(armral_modulation_type mod_type) { uint8_t nb_bits; @@ -72,4 +72,4 @@ std::string print_valid_mod_type(int num_tabs) { return os.str(); } -} // namespace armral_simulation +} // namespace armral::simulation diff --git a/simulation/include/simulation_common.py b/simulation/include/simulation_common.py index f062f40..1500149 100755 --- a/simulation/include/simulation_common.py +++ b/simulation/include/simulation_common.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from dataclasses import dataclass from datetime import datetime diff --git a/simulation/ldpc_awgn/CMakeLists.txt b/simulation/ldpc_awgn/CMakeLists.txt deleted file mode 100644 index bfba143..0000000 --- a/simulation/ldpc_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(ldpc_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - 
message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(LDPC_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/ldpc_awgn.cpp -) - -set(LDPC_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(LDPC_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(ldpc_awgn ${LDPC_AWGN_SOURCES}) -target_link_libraries(ldpc_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(ldpc_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(ldpc_awgn PRIVATE - ${LDPC_AWGN_COMPILE_OPTIONS} - ${LDPC_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation ldpc_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. - add_test(NAME ldpc_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ldpc_awgn -z 3 -b 1 -m 0 -r 0 -u 128) - set_tests_properties(ldpc_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check ldpc_awgn) -endif() diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp index 9df77ae..19db5f7 100644 --- a/simulation/ldpc_awgn/ldpc_awgn.cpp +++ b/simulation/ldpc_awgn/ldpc_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -91,7 +91,7 @@ void usage(const char *exe_name) { << " values are:\n" << " " << print_valid_base_graph() << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " The redundancy version to be used. Supported\n" << " values are:\n" << " " << print_valid_redundancy_version() @@ -142,7 +142,7 @@ struct ldpc_example_data { data_in_bytes = SNEW(uint8_t, 2 * z); data_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); data_encoded_bytes = SNEW(uint8_t, len_encoded); - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); nref = 0; len_rate_matched = bit_per_symbol * ((len_encoded + bit_per_symbol - 1) / bit_per_symbol); @@ -169,14 +169,14 @@ struct ldpc_example_data { } }; -int run_check(random_state *state, uint32_t z, armral_ldpc_graph_t bg, - uint32_t rv, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, uint32_t z, + armral_ldpc_graph_t bg, uint32_t rv, double snr_db, uint32_t ulp, ldpc_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < (data->len_in - data->len_filler_bits); ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -204,8 +204,8 @@ int run_check(random_state *state, uint32_t z, armral_ldpc_graph_t bg, data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -255,9 +255,9 @@ struct sim_result { armral_modulation_type mod, uint32_t rv_in, double ebn0_in, double snr_in, uint16_t ulp_in, uint16_t filler_bits_len, uint32_t nb, uint32_t nm, uint32_t num_messages) - : n(n_in), bg((int)bg_in + 1), mod_type(armral_simulation::mod_to_str(mod)), - rv(rv_in), ebn0(ebn0_in), snr(snr_in), ulp(ulp_in), - len_filler_bits(filler_bits_len), + : n(n_in), bg((int)bg_in + 1), + mod_type(armral::simulation::mod_to_str(mod)), rv(rv_in), ebn0(ebn0_in), + snr(snr_in), ulp(ulp_in), len_filler_bits(filler_bits_len), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * n_in)) {} @@ -290,7 +290,7 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, uint16_t len_filler_bits, double ebn0_db) { const auto *graph = armral_ldpc_get_base_graph(bg); // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding rate (k/n) for LDPC base graph 1 is 1/3 (k = 22 Z, n = 66 Z) // and 1/5 for LDPC base graph 2 (k = 10 Z, n = 50 Z), see 3GPP TS 38.212 double coding_rate; @@ -306,8 +306,8 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. 
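The coding rate chosen above follows from the LDPC mother-code dimensions in 3GPP TS 38.212: base graph 1 is 22Z-by-66Z and base graph 2 is 10Z-by-50Z for lifting size Z. A small sketch of those dimensions:

    def ldpc_dimensions(base_graph, z):
        # Mother-code dimensions from TS 38.212: BG1 -> k = 22Z, n = 66Z;
        # BG2 -> k = 10Z, n = 50Z.
        k_cols, n_cols = {1: (22, 66), 2: (10, 50)}[base_graph]
        return k_cols * z, n_cols * z, k_cols / n_cols

    print(ldpc_dimensions(1, z=3))  # (66, 198, 0.333...)
    print(ldpc_dimensions(2, z=3))  # (30, 150, 0.2)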
double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -321,7 +321,7 @@ bool run_snr(uint32_t z, armral_modulation_type mod_type, ldpc_example_data data(z, mod_type, graph, len_filler_bits); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, z, bg, rv, snr_db, ulp, &data); nb += num_bit_errors; @@ -405,10 +405,10 @@ int main(int argc, char **argv) { << "\t" << print_valid_base_graph() << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/ldpc_awgn/ldpc_error_rate.py b/simulation/ldpc_awgn/ldpc_error_rate.py index 32e75e3..0eb6643 100755 --- a/simulation/ldpc_awgn/ldpc_error_rate.py +++ b/simulation/ldpc_awgn/ldpc_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/modulation_awgn/CMakeLists.txt b/simulation/modulation_awgn/CMakeLists.txt deleted file mode 100644 index c30886a..0000000 --- a/simulation/modulation_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(modulation_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(MODULATION_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/modulation_awgn.cpp -) - -set(MODULATION_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(MODULATION_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(modulation_awgn ${MODULATION_AWGN_SOURCES}) -target_link_libraries(modulation_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(modulation_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(modulation_awgn PRIVATE - ${MODULATION_AWGN_COMPILE_OPTIONS} - ${MODULATION_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation modulation_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. 
- add_test(NAME modulation_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/modulation_awgn -k 32 -m 0 -u 128) - set_tests_properties(modulation_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check modulation_awgn) -endif() diff --git a/simulation/modulation_awgn/modulation_awgn.cpp b/simulation/modulation_awgn/modulation_awgn.cpp index 426c90e..bd493e5 100644 --- a/simulation/modulation_awgn/modulation_awgn.cpp +++ b/simulation/modulation_awgn/modulation_awgn.cpp @@ -1,8 +1,8 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -18,7 +18,7 @@ void usage(const char *exe_name) { << "The arguments required by " << exe_name << " are:\n\n" << " Number of bits in the encoded message.\n" << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(2) + << armral::simulation::print_valid_mod_type(2) << " Scaling parameter used in demodulation when\n" << " using fixed-point Q2.13 representation for symbols.\n" << " is an integer such that the symbol\n" @@ -40,7 +40,7 @@ struct example_data { example_data(uint32_t k, armral_modulation_type mod) { mod_type = mod; - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); len_in = k; num_mod_symbols = (len_in + bit_per_symbol - 1) / bit_per_symbol; data_in = SNEW(uint8_t, (len_in + 7) / 8); @@ -59,13 +59,13 @@ struct example_data { // Perform an end-to-end modulation, transmission, demodulation, and count the // number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -79,8 +79,8 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data_in, data->data_mod); // AWGN channel effects - add some noise to all the modulated bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -102,7 +102,7 @@ struct sim_result { sim_result(uint32_t k_in, armral_modulation_type mod, uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages) - : k(k_in), mod_type(armral_simulation::mod_to_str(mod)), ulp(ulp_in), + : k(k_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * k)) {} @@ -129,7 +129,7 @@ struct sim_result { bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, double ebn0_db) { // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); // The coding rate is used to convert from Eb/N0 to SNR. This program doesn't // use a coding scheme, so the number of output bits is equal to the number of // input bits. @@ -140,8 +140,8 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -154,7 +154,7 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, example_data data(k, mod_type); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, &data); nb += num_bit_errors; num_message_errors += num_bit_errors == 0 ? 
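`bits_per_symbol` and the ceiling division in `example_data` determine how many constellation symbols a message occupies. The modulation set below (QPSK through 256QAM) mirrors what the simulators document and is an assumption about the library's enum, not a copy of it:

    # Assumed mapping from modulation scheme to bits per constellation symbol.
    BITS_PER_SYMBOL = {"QPSK": 2, "16QAM": 4, "64QAM": 6, "256QAM": 8}

    def num_mod_symbols(num_bits, modulation):
        bps = BITS_PER_SYMBOL[modulation]
        # Ceiling division, as in (len_in + bit_per_symbol - 1) / bit_per_symbol.
        return (num_bits + bps - 1) // bps

    print(num_mod_symbols(32, "QPSK"))   # 16
    print(num_mod_symbols(33, "64QAM"))  # 6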
0 : 1; @@ -178,7 +178,7 @@ bool run_snr(uint32_t k, armral_modulation_type mod_type, uint16_t ulp, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; uint16_t ulp = 0; armral_modulation_type mod_type = ARMRAL_MOD_256QAM; @@ -212,10 +212,10 @@ int main(int argc, char **argv) { << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } diff --git a/simulation/modulation_awgn/modulation_error_rate.py b/simulation/modulation_awgn/modulation_error_rate.py index c5b72ea..14ff20c 100755 --- a/simulation/modulation_awgn/modulation_error_rate.py +++ b/simulation/modulation_awgn/modulation_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser import pandas as pd diff --git a/simulation/polar_awgn/CMakeLists.txt b/simulation/polar_awgn/CMakeLists.txt deleted file mode 100644 index 249b4bc..0000000 --- a/simulation/polar_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(polar_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(POLAR_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/polar_awgn.cpp -) - -set(POLAR_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(POLAR_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(polar_awgn ${POLAR_AWGN_SOURCES}) -target_link_libraries(polar_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(polar_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(polar_awgn PRIVATE - ${POLAR_AWGN_COMPILE_OPTIONS} - ${POLAR_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation polar_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. - add_test(NAME polar_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/polar_awgn -k 32 -e 32 -l 1 -m 0 -i 0 -u 128) - set_tests_properties(polar_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check polar_awgn) -endif() diff --git a/simulation/polar_awgn/polar_awgn.cpp b/simulation/polar_awgn/polar_awgn.cpp index 76241f0..c25aa41 100644 --- a/simulation/polar_awgn/polar_awgn.cpp +++ b/simulation/polar_awgn/polar_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -56,7 +56,7 @@ void usage(const char *exe_name) { << " this program expects greater\n" << " than or equal to .\n" << " Type of modulation. 
Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " Flag to enable/disable the interleaving of \n" << " coded bits in Polar rate matching.\n" << " Type 0 : Downlink, Type 1 : Uplink.\n" @@ -129,7 +129,7 @@ struct polar_example_data { i_bil = i_bil_in; mod_type = mod; demod_ulp = demod_ulp_in; - bits_per_mod_symbol = armral_simulation::bits_per_symbol(mod_type); + bits_per_mod_symbol = armral::simulation::bits_per_symbol(mod_type); // We want to encode k bits of data data_in = SNEW(uint8_t, (k + 7) / 8); // Codeword length is n, which is what we interleave @@ -171,15 +171,16 @@ struct polar_example_data { } }; -int run_check(random_state *state, double snr_db, polar_example_data *data) { +int run_check(armral::utils::random_state *state, double snr_db, + polar_example_data *data) { uint32_t crc_bits = 24; // CRC-24 (L = 24) uint32_t msg_bits = data->k - crc_bits; // message length (A = K - L) std::vector msg((msg_bits + 7) / 8); for (uint32_t i = 0; i < msg_bits; ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). Not sure if // that is an issue with randomly generated data, but we are paying @@ -216,8 +217,8 @@ int run_check(random_state *state, double snr_db, polar_example_data *data) { data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); armral_demodulation(data->num_mod_symbols, data->demod_ulp, data->mod_type, data->data_mod, data->data_demod_soft); @@ -271,7 +272,7 @@ struct sim_result { uint16_t ulp_in, double ebn0_in, double snr_in, uint32_t nb, uint32_t nm, uint32_t num_messages) : len(n), e(e_in), k(k_in), l(l_in), - mod_type(armral_simulation::mod_to_str(mod)), i_bil(i_bil_in), + mod_type(armral::simulation::mod_to_str(mod)), i_bil(i_bil_in), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * k)) {} @@ -306,7 +307,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, uint16_t ulp, double ebn0_db) { uint32_t n = get_codeword_length(e, k); // Compute SNR in dB - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); double coding_rate = (double)k / n; double bw = 1e6; // Bandwidth (B) = 1 MHz // The symbol rate R [symbols/s] is proportional to the bandwidth. For @@ -314,8 +315,8 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. 
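For the Polar simulation, `K` bits are encoded but 24 of them are the appended CRC-24, so the message proper carries `A = K - L` bits, and the coding rate reported by `run_snr` is `K / N` for codeword length `N`. A sketch with `N` taken as given; the rate-matching rules that `get_codeword_length` implements are not reproduced here:

    def polar_params(k, n, crc_bits=24):
        # A = K - L information bits once the CRC-24 is removed; rate = K / N.
        return k - crc_bits, k / n

    # e.g. K = 56 bits carried by an N = 128 codeword.
    print(polar_params(56, 128))  # (32, 0.4375)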
double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); int nb = 0; uint64_t nr_total = 0; @@ -327,7 +328,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, polar_example_data data(n, e, k, l, i_bil, mod_type, ulp); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, &data); nb += num_bit_errors; num_message_errors += num_bit_errors == 0 ? 0 : 1; @@ -348,7 +349,7 @@ bool run_snr(uint32_t e, uint32_t k, uint32_t l, int main(int argc, char **argv) { - // Initialisation + // Initialization uint32_t k = 0; bool is_k_set = false; uint32_t e = 0; @@ -408,10 +409,10 @@ int main(int argc, char **argv) { << "\t" << print_valid_l() << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } if (!is_i_bil_set) { diff --git a/simulation/polar_awgn/polar_error_rate.py b/simulation/polar_awgn/polar_error_rate.py index 5cd4234..f8a76cb 100755 --- a/simulation/polar_awgn/polar_error_rate.py +++ b/simulation/polar_awgn/polar_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/turbo_awgn/CMakeLists.txt b/simulation/turbo_awgn/CMakeLists.txt deleted file mode 100644 index 0f6389a..0000000 --- a/simulation/turbo_awgn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -cmake_minimum_required(VERSION 3.3) -project(turbo_awgn VERSION 0.0) - -if (NOT OpenMP_CXX_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) - message(FATAL_ERROR "OpenMP flags not specified. Please invoke CMake from the simulation directory") -endif() - -set(TURBO_AWGN_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/turbo_awgn.cpp -) - -set(TURBO_AWGN_COMPILE_OPTIONS ${ARMRAL_ARCH_COMPILE_OPTIONS}) -set(TURBO_AWGN_COMPILER_FLAGS - $<$:-Wshadow -Wall -Wcast-qual -fno-rtti -fno-exceptions -std=c++17 ${OpenMP_CXX_FLAGS}>) - -add_executable(turbo_awgn ${TURBO_AWGN_SOURCES}) -target_link_libraries(turbo_awgn PUBLIC simulation_common armral armral_awgn armral_utils) -target_link_libraries(turbo_awgn PRIVATE OpenMP::OpenMP_CXX) -target_compile_options(turbo_awgn PRIVATE - ${TURBO_AWGN_COMPILE_OPTIONS} - ${TURBO_AWGN_COMPILER_FLAGS} - "$<$:-Og>" -) - -add_dependencies(simulation turbo_awgn) - -if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) - # Add test for the simulation executable - # At present this just checks that the executable can be successfully invoked with - # a set of valid inputs. We do not check the validity of the output. - # We also only run this if we are not using a test running wrapper. 
- add_test(NAME turbo_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/turbo_awgn -k 40 -m 0 -i 1 -e 60) - set_tests_properties(turbo_awgn PROPERTIES TIMEOUT 3000) - add_dependencies(check turbo_awgn) -endif() diff --git a/simulation/turbo_awgn/turbo_awgn.cpp b/simulation/turbo_awgn/turbo_awgn.cpp index 29c8bdb..7dbc271 100644 --- a/simulation/turbo_awgn/turbo_awgn.cpp +++ b/simulation/turbo_awgn/turbo_awgn.cpp @@ -1,9 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" -#include "awgn.h" +#include "awgn.hpp" #include "bit_utils.hpp" #include "simulation_common.hpp" @@ -90,7 +90,7 @@ void usage(const char *exe_name) { << " This must be one of:\n" << print_valid_block_size("\t\t\t") << " Type of modulation. Supported values are:\n" - << armral_simulation::print_valid_mod_type(3) + << armral::simulation::print_valid_mod_type(3) << " Number of bits in the rate-matched message.\n" << " The redundancy version used for rate matching\n" << " and recovery. Supported values are:\n" @@ -145,7 +145,7 @@ struct turbo_example_data { par_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); itl_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); data_matched = SNEW(uint8_t, (len_matched + 7) / 8); - bit_per_symbol = armral_simulation::bits_per_symbol(mod_type); + bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); num_mod_symbols = (len_matched + bit_per_symbol - 1) / bit_per_symbol; data_mod = SNEW(armral_cmplx_int16_t, num_mod_symbols); sys_recovered = SNEW(int8_t, len_encoded); @@ -175,13 +175,13 @@ struct turbo_example_data { // Perform an end-to-end encoding, rate matching, modulation, transmission, // demodulation, rate recovery, and decoding and count the number of errors -int run_check(random_state *state, double snr_db, uint32_t ulp, +int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, uint32_t iter_max, turbo_example_data *data) { // Init data memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = - static_cast(linear_congruential_generator{}.one(state)); + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); uint16_t byte_ind = i / 8; // The most significant bit is the first bit (in wire order). 
Not sure if // that is an issue with randomly generated data, but we are paying @@ -206,8 +206,8 @@ int run_check(random_state *state, double snr_db, uint32_t ulp, data->mod_type, data->data_matched, data->data_mod); // AWGN channel effects - add some noise to all the encoded bits - add_awgn(state, data->num_mod_symbols, snr_db, ARMRAL_FIXED_POINT_INDEX_Q2_13, - data->data_mod); + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); // Run demodulation armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, @@ -251,7 +251,7 @@ struct sim_result { uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t iter_max_in, uint32_t nb, uint32_t nm, uint32_t num_messages) - : k(k_in), e(e_in), mod_type(armral_simulation::mod_to_str(mod)), + : k(k_in), e(e_in), mod_type(armral::simulation::mod_to_str(mod)), ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), iter_max(iter_max_in), bler(static_cast(nm) / num_messages), ber(static_cast(nb) / (num_messages * k)) {} @@ -284,15 +284,15 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, // The coding rate is the ratio of input information bits, k, to the number of // rate-matched bits, e. double coding_rate = (double)k / e; - int bits_per_symb = armral_simulation::bits_per_symbol(mod_type); + int bits_per_symb = armral::simulation::bits_per_symbol(mod_type); double bw = 1e6; // Bandwidth (B) = 1 MHz // The symbol rate R [symbols/s] is proportional to the bandwidth. For // passband transmission using QAM modulation the maximum spectral efficiency // is equal to the number of bits per symbol. To meet this criteria we take // the symbol rate equal to the bandwidth. double symb_rate = bw; - double snr_db = - ebn0_to_snr(coding_rate, bits_per_symb, symb_rate, bw, ebn0_db); + double snr_db = armral::simulation::ebn0_to_snr(coding_rate, bits_per_symb, + symb_rate, bw, ebn0_db); double tolerance = 1.0e-9; int nb = 0; @@ -305,7 +305,7 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, turbo_example_data data(k, mod_type, e, rv); #pragma omp for for (uint64_t r = 0; r < nr; ++r) { - auto state = random_state::from_seeds({r, nr_total}); + auto state = armral::utils::random_state::from_seeds({r, nr_total}); uint32_t num_bit_errors = run_check(&state, snr_db, ulp, iter_max, &data); nb += num_bit_errors; @@ -379,10 +379,10 @@ int main(int argc, char **argv) { << print_valid_block_size("\t") << std::endl; print_usage = true; } - if (!is_mod_set || !armral_simulation::is_valid_mod_type(mod_type)) { + if (!is_mod_set || !armral::simulation::is_valid_mod_type(mod_type)) { std::cerr << "Modulation type is invalid or not specified.\n" << "Must be one of:\n" - << armral_simulation::print_valid_mod_type(1) << std::endl; + << armral::simulation::print_valid_mod_type(1) << std::endl; print_usage = true; } if (!is_e_set) { diff --git a/simulation/turbo_awgn/turbo_error_rate.py b/simulation/turbo_awgn/turbo_error_rate.py index 6725cdf..51cd9fd 100755 --- a/simulation/turbo_awgn/turbo_error_rate.py +++ b/simulation/turbo_awgn/turbo_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp 
b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp index 5c889e1..c3941c2 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -407,11 +407,11 @@ void invert_hermitian_matrix<3>(const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t adj21 = {adj12.re, -adj12.im}; // Determinant (real): A_{0:} * adj(A)_{:0} - float det00 = a00.re * adj00.re; - float det11 = scal_mul_cmplx_f32(a01, adj10).re; - float det22 = scal_mul_cmplx_f32(a02, adj20).re; + float32_t det00 = a00.re * adj00.re; + float32_t det11 = scal_mul_cmplx_f32(a01, adj10).re; + float32_t det22 = scal_mul_cmplx_f32(a02, adj20).re; - float inv_det = 1.0F / (det00 + det11 + det22); + float32_t inv_det = 1.0F / (det00 + det11 + det22); // Write into output array p_dst[0] = {adj00.re * inv_det, adj00.im * inv_det}; @@ -646,8 +646,8 @@ invert_batch_hermitian_matrix_3x3(uint32_t num_mats, assert(num_mats % 4 == 0); - const float *src = (const float *)p_src; - float *dst = (float *)p_dst; + const float32_t *src = (const float32_t *)p_src; + float32_t *dst = (float32_t *)p_dst; uint32_t stride = num_mats * 2; for (uint32_t mat_i = 0; mat_i + 4 <= num_mats; mat_i += 4) { @@ -783,22 +783,22 @@ invert_batch_hermitian_matrix_3x3_pa( assert(num_mats % 4 == 0); - const float *src_00 = (const float *)p_srcs[0]; - const float *src_01 = (const float *)p_srcs[1]; - const float *src_02 = (const float *)p_srcs[2]; - const float *src_11 = (const float *)p_srcs[4]; - const float *src_12 = (const float *)p_srcs[5]; - const float *src_22 = (const float *)p_srcs[8]; - - float *dst_00 = (float *)p_dsts[0]; - float *dst_01 = (float *)p_dsts[1]; - float *dst_02 = (float *)p_dsts[2]; - float *dst_10 = (float *)p_dsts[3]; - float *dst_11 = (float *)p_dsts[4]; - float *dst_12 = (float *)p_dsts[5]; - float *dst_20 = (float *)p_dsts[6]; - float *dst_21 = (float *)p_dsts[7]; - float *dst_22 = (float *)p_dsts[8]; + const float32_t *src_00 = (const float32_t *)p_srcs[0]; + const float32_t *src_01 = (const float32_t *)p_srcs[1]; + const float32_t *src_02 = (const float32_t *)p_srcs[2]; + const float32_t *src_11 = (const float32_t *)p_srcs[4]; + const float32_t *src_12 = (const float32_t *)p_srcs[5]; + const float32_t *src_22 = (const float32_t *)p_srcs[8]; + + float32_t *dst_00 = (float32_t *)p_dsts[0]; + float32_t *dst_01 = (float32_t *)p_dsts[1]; + float32_t *dst_02 = (float32_t *)p_dsts[2]; + float32_t *dst_10 = (float32_t *)p_dsts[3]; + float32_t *dst_11 = (float32_t *)p_dsts[4]; + float32_t *dst_12 = (float32_t *)p_dsts[5]; + float32_t *dst_20 = (float32_t *)p_dsts[6]; + float32_t *dst_21 = (float32_t *)p_dsts[7]; + float32_t *dst_22 = (float32_t *)p_dsts[8]; #if ARMRAL_ARCH_SVE >= 2 for (uint32_t mat_i = 0; mat_i < num_mats; mat_i += 4) { @@ -1357,7 +1357,7 @@ static void sve_invert_hermitian_matrix4x4( svfloat32_t z = svdup_n_f32(0); - // Enable the compiler to optimise away loading c0 and c1. + // Enable the compiler to optimize away loading c0 and c1. 
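The 3x3 Hermitian inverse above is formed from the adjugate, using the facts that the adjugate of a Hermitian matrix is Hermitian and that the determinant (first row of `A` times first column of `adj(A)`) is real. The same algebra in plain Python, as a readable reference rather than the library's vectorized kernels:

    def invert_hermitian_3x3(a):
        # a: 3x3 Hermitian matrix as nested lists of complex numbers.
        adj = [[0j] * 3 for _ in range(3)]
        for i in range(3):
            for j in range(3):
                rows = [r for r in range(3) if r != j]
                cols = [c for c in range(3) if c != i]
                minor = (a[rows[0]][cols[0]] * a[rows[1]][cols[1]]
                         - a[rows[0]][cols[1]] * a[rows[1]][cols[0]])
                adj[i][j] = ((-1) ** (i + j)) * minor
        # Laplace expansion along the first row: real for a Hermitian matrix.
        det = (a[0][0] * adj[0][0] + a[0][1] * adj[1][0] + a[0][2] * adj[2][0]).real
        return [[adj[i][j] / det for j in range(3)] for i in range(3)]

    a = [[2 + 0j, 1 - 1j, 0j], [1 + 1j, 3 + 0j, 1j], [0j, -1j, 1 + 0j]]
    inv = invert_hermitian_3x3(a)
    print(sum(a[0][k] * inv[k][0] for k in range(3)))  # ~ (1+0j)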
sve_mat_conj_tran_2x2(b0, b1, &c0, &c1); svfloat32_t a_inv0; @@ -1407,7 +1407,7 @@ void invert_hermitian_matrix<4>(const armral_cmplx_f32_t *__restrict p_src, #ifdef ARMRAL_ARCH_SVE svbool_t p4 = svptrue_pat_b32(SV_VL4); - const float *src = (const float *)p_src; + const float32_t *src = (const float32_t *)p_src; svfloat32_t a0 = svld1_f32(p4, &src[0 * 4]); svfloat32_t b0 = svld1_f32(p4, &src[1 * 4]); svfloat32_t a1 = svld1_f32(p4, &src[2 * 4]); @@ -1450,7 +1450,7 @@ void invert_hermitian_matrix<4>(const armral_cmplx_f32_t *__restrict p_src, float32x4x2_t c; float32x4x2_t d; - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; // Fill sub-blocks matrix a.val[FIRST_ROW] = vld1q_f32(p_mat); @@ -1857,8 +1857,8 @@ invert_batch_hermitian_matrix_4x4(uint32_t num_mats, const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t *p_dst) { - const float *src = (const float *)p_src; - float *dst = (float *)p_dst; + const float32_t *src = (const float32_t *)p_src; + float32_t *dst = (float32_t *)p_dst; uint32_t stride = num_mats * 2; #if ARMRAL_ARCH_SVE >= 2 @@ -2097,39 +2097,39 @@ invert_batch_hermitian_matrix_4x4_pa( uint32_t num_mats, const armral_cmplx_f32_t *__restrict *__restrict p_srcs, armral_cmplx_f32_t *__restrict *__restrict p_dsts) { - const float *src_00 = (const float *)p_srcs[0]; - const float *src_01 = (const float *)p_srcs[1]; - const float *src_02 = (const float *)p_srcs[2]; - const float *src_03 = (const float *)p_srcs[3]; - const float *src_10 = (const float *)p_srcs[4]; - const float *src_11 = (const float *)p_srcs[5]; - const float *src_12 = (const float *)p_srcs[6]; - const float *src_13 = (const float *)p_srcs[7]; - const float *src_20 = (const float *)p_srcs[8]; - const float *src_21 = (const float *)p_srcs[9]; - const float *src_22 = (const float *)p_srcs[10]; - const float *src_23 = (const float *)p_srcs[11]; - const float *src_30 = (const float *)p_srcs[12]; - const float *src_31 = (const float *)p_srcs[13]; - const float *src_32 = (const float *)p_srcs[14]; - const float *src_33 = (const float *)p_srcs[15]; - - float *dst_00 = (float *)p_dsts[0]; - float *dst_01 = (float *)p_dsts[1]; - float *dst_02 = (float *)p_dsts[2]; - float *dst_03 = (float *)p_dsts[3]; - float *dst_10 = (float *)p_dsts[4]; - float *dst_11 = (float *)p_dsts[5]; - float *dst_12 = (float *)p_dsts[6]; - float *dst_13 = (float *)p_dsts[7]; - float *dst_20 = (float *)p_dsts[8]; - float *dst_21 = (float *)p_dsts[9]; - float *dst_22 = (float *)p_dsts[10]; - float *dst_23 = (float *)p_dsts[11]; - float *dst_30 = (float *)p_dsts[12]; - float *dst_31 = (float *)p_dsts[13]; - float *dst_32 = (float *)p_dsts[14]; - float *dst_33 = (float *)p_dsts[15]; + const float32_t *src_00 = (const float32_t *)p_srcs[0]; + const float32_t *src_01 = (const float32_t *)p_srcs[1]; + const float32_t *src_02 = (const float32_t *)p_srcs[2]; + const float32_t *src_03 = (const float32_t *)p_srcs[3]; + const float32_t *src_10 = (const float32_t *)p_srcs[4]; + const float32_t *src_11 = (const float32_t *)p_srcs[5]; + const float32_t *src_12 = (const float32_t *)p_srcs[6]; + const float32_t *src_13 = (const float32_t *)p_srcs[7]; + const float32_t *src_20 = (const float32_t *)p_srcs[8]; + const float32_t *src_21 = (const float32_t *)p_srcs[9]; + const float32_t *src_22 = (const float32_t *)p_srcs[10]; + const float32_t *src_23 = (const float32_t *)p_srcs[11]; + const float32_t *src_30 = (const float32_t *)p_srcs[12]; + const float32_t *src_31 = (const float32_t *)p_srcs[13]; + const 
float32_t *src_32 = (const float32_t *)p_srcs[14]; + const float32_t *src_33 = (const float32_t *)p_srcs[15]; + + float32_t *dst_00 = (float32_t *)p_dsts[0]; + float32_t *dst_01 = (float32_t *)p_dsts[1]; + float32_t *dst_02 = (float32_t *)p_dsts[2]; + float32_t *dst_03 = (float32_t *)p_dsts[3]; + float32_t *dst_10 = (float32_t *)p_dsts[4]; + float32_t *dst_11 = (float32_t *)p_dsts[5]; + float32_t *dst_12 = (float32_t *)p_dsts[6]; + float32_t *dst_13 = (float32_t *)p_dsts[7]; + float32_t *dst_20 = (float32_t *)p_dsts[8]; + float32_t *dst_21 = (float32_t *)p_dsts[9]; + float32_t *dst_22 = (float32_t *)p_dsts[10]; + float32_t *dst_23 = (float32_t *)p_dsts[11]; + float32_t *dst_30 = (float32_t *)p_dsts[12]; + float32_t *dst_31 = (float32_t *)p_dsts[13]; + float32_t *dst_32 = (float32_t *)p_dsts[14]; + float32_t *dst_33 = (float32_t *)p_dsts[15]; #if ARMRAL_ARCH_SVE >= 2 for (uint32_t mat_i = 0; mat_i < num_mats; mat_i += 4) { @@ -2523,7 +2523,7 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, store_quadrant<8>(d_out, 1, 1, p_dst); #else - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; float32x4_t mat_a[8]; float32x4_t mat_b[8]; float32x4_t mat_c[8]; @@ -2553,9 +2553,9 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, p_mat += 4; } - float *p_mat_a = (float *)mat_a; - float *p_mat_b = (float *)mat_b; - float *p_mat_c = (float *)mat_c; + float32_t *p_mat_a = (float32_t *)mat_a; + float32_t *p_mat_b = (float32_t *)mat_b; + float32_t *p_mat_c = (float32_t *)mat_c; /*Calculate inverse sublock A */ float32x4_t inv_a[8]; @@ -2618,7 +2618,7 @@ void invert_hermitian_matrix<8>(const armral_cmplx_f32_t *__restrict p_src, block11[i] = vaddq_f32(inv_a[i], temp_mat2[i]); } - float *p_inv = (float *)p_dst; + float32_t *p_inv = (float32_t *)p_dst; vst1q_f32(p_inv, block11[0]); p_inv += 4; @@ -2743,7 +2743,7 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, store_quadrant<16>(c_out, 1, 0, p_dst); store_quadrant<16>(d_out, 1, 1, p_dst); #else - const float *p_mat = (const float *)p_src; + const float32_t *p_mat = (const float32_t *)p_src; float32x4_t mat_a[32]; float32x4_t mat_b[32]; float32x4_t mat_c[32]; @@ -2789,9 +2789,9 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, p_mat += 4; } - float *p_mat_a = (float *)mat_a; - float *p_mat_b = (float *)mat_b; - float *p_mat_c = (float *)mat_c; + float32_t *p_mat_a = (float32_t *)mat_a; + float32_t *p_mat_b = (float32_t *)mat_b; + float32_t *p_mat_c = (float32_t *)mat_c; /*Calculate inverse sublock A */ float32x4_t inv_a[32]; @@ -2864,7 +2864,7 @@ void invert_hermitian_matrix<16>(const armral_cmplx_f32_t *__restrict p_src, block11[2 * i + 1] = vaddq_f32(inv_a[2 * i + 1], temp_mat2[2 * i + 1]); } - float *p_inv = (float *)p_dst; + float32_t *p_inv = (float32_t *)p_dst; vst1q_f32(p_inv, block11[2 * 0 + 0]); p_inv += 4; diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp index fccc80a..7105e2b 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -668,7 +668,7 @@ static void invert_batch_matrix_2x2_impl( // det = a * d - b * c 
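/*
 * Editorial aside, not part of the patch: the 2x2 batch inversion in this
 * hunk forms the complex reciprocal of the determinant as conj(det) / |det|^2,
 * which is what det_abs2_inv provides. A scalar sketch follows; the c2f32
 * type and the inv2x2 name are hypothetical, not ArmRAL API.
 */
typedef struct { float re, im; } c2f32;

static void inv2x2(const c2f32 m[4], c2f32 out[4]) {
  /* m = | a b ; c d | stored row-major; det = a*d - b*c. */
  c2f32 det = {m[0].re * m[3].re - m[0].im * m[3].im -
                   (m[1].re * m[2].re - m[1].im * m[2].im),
               m[0].re * m[3].im + m[0].im * m[3].re -
                   (m[1].re * m[2].im + m[1].im * m[2].re)};
  float det_abs2_inv = 1.0f / (det.re * det.re + det.im * det.im);
  c2f32 inv_det = {det.re * det_abs2_inv, -det.im * det_abs2_inv};
  /* inverse = | d -b ; -c a | * inv_det */
  c2f32 adj[4] = {m[3], {-m[1].re, -m[1].im}, {-m[2].re, -m[2].im}, m[0]};
  for (int i = 0; i < 4; i++) {
    out[i].re = adj[i].re * inv_det.re - adj[i].im * inv_det.im;
    out[i].im = adj[i].re * inv_det.im + adj[i].im * inv_det.re;
  }
}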
armral_cmplx_f32_t det = scal_minor_cmplx_f32(a, b, c, d); // inv_det = 1 / det = conj(det) / abs(det)^2 - float det_abs2_inv = 1.0F / scal_mod2_cmplx_f32(det).re; + float32_t det_abs2_inv = 1.0F / scal_mod2_cmplx_f32(det).re; armral_cmplx_f32_t inv_det = {det.re * det_abs2_inv, -det.im * det_abs2_inv}; armral_cmplx_f32_t minus_inv_det = {-inv_det.re, -inv_det.im}; // p_dst = | d, -b | * inv_det diff --git a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp index cb47a3e..49eebad 100644 --- a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp +++ b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ namespace armral::cmplx_herm_mat_inv { diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp index 29d1683..339080c 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c index ddad389..15d40f5 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -19,6 +19,38 @@ static void cmplx_mat_mult_ahb_b2x2(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 2; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p4 = svptrue_pat_b32(SV_VL4); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + svfloat32_t b_r0 = svld1_f32(p4, (float32_t const *)p_src_b); + svfloat32_t b_r1 = svld1_f32(p4, ((float32_t const *)p_src_b) + 4); + svfloat32_t b_r0_rev = svrev64_f32(p4, b_r0); + svfloat32_t b_r1_rev = svrev64_f32(p4, b_r1); + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t acc_1; + svfloat32_t acc_2; + + // r = 0 + svfloat32_t a_r0j = svld1_f32(p2, (float32_t const *)&p_src_a[j]); + // [R0*r, R0*i, R0*r, R0*i] + acc_1 = svmul_lane_f32(b_r0, a_r0j, 0); + // [I0*i, I0*r, I0*i, I0*r] + acc_2 = svmul_lane_f32(b_r0_rev, a_r0j, 1); + + // r = 1 + svfloat32_t a_r1j = svld1_f32(p2, (float32_t const *)&p_src_a[n + j]); + // [R0*r, R0*i, R0*r, R0*i] + acc_1 = svmla_lane_f32(acc_1, b_r1, a_r1j, 0); + // [I0*i, I0*r, I0*i, I0*r] + acc_2 = svmla_lane_f32(acc_2, b_r1_rev, a_r1j, 1); + + svfloat32_t result = svadd_f32_x( + p4, acc_1, + svreinterpret_f32_f64(svneg_f64_x(p4, svreinterpret_f64_f32(acc_2)))); + svst1_f32(p4, (float32_t *)&p_dst[mk * j], result); + } +#else float32x4_t b_r0 = vld1q_f32((float32_t const *)p_src_b); float32x4_t b_r1 = vld1q_f32(((float32_t const *)p_src_b) + 4); float32x4_t b_r0_rev = vrev64q_f32(b_r0); @@ -46,6 +78,7 @@ static void cmplx_mat_mult_ahb_b2x2(uint16_t n, vst1q_f32((float32_t *)&p_dst[mk * j], result); } +#endif } static void cmplx_mat_mult_ahb_b3x3(uint16_t n, @@ -54,6 +87,42 @@ static void 
cmplx_mat_mult_ahb_b3x3(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 3; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p3 = svptrue_pat_b32(SV_VL3); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + + svfloat32x2_t b_r0 = svld2_f32(p3, (float32_t const *)&p_src_b[0]); + svfloat32x2_t b_r1 = svld2_f32(p3, (float32_t const *)&p_src_b[3]); + svfloat32x2_t b_r2 = svld2_f32(p3, (float32_t const *)&p_src_b[6]); + svfloat32x2_t *b_rows[] = {&b_r0, &b_r1, &b_r2}; + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t dot_0 = svundef_f32(); + svfloat32_t dot_1 = svundef_f32(); + // Note: We leave it to the compiler to unroll this loop over mk + for (uint32_t r = 0; r < mk; r++) { + svfloat32_t a_rj = svld1_f32(p2, (float32_t const *)&p_src_a[r * n + j]); + // Note: We leave it to the compiler to eliminate the following branch + if (r == 0) { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmul_lane_f32(svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmul_lane_f32(svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } else { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmla_lane_f32(dot_1, svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } + } + svfloat32x2_t dot = svcreate2(dot_0, dot_1); + svst2_f32(p3, (float32_t *)&p_dst[mk * j], dot); + } +#else // Copy the final row of B so we can safely read one extra column: armral_cmplx_f32_t final_row[4]; memcpy(final_row, &p_src_b[6], sizeof(armral_cmplx_f32_t) * 3); @@ -100,6 +169,7 @@ static void cmplx_mat_mult_ahb_b3x3(uint16_t n, // Store the remaining column: vst1_f32(((float32_t *)&p_dst[mk * j]) + 4, vget_low_f32(result.val[1])); } +#endif } static void cmplx_mat_mult_ahb_b4x4(uint16_t n, @@ -108,6 +178,44 @@ static void cmplx_mat_mult_ahb_b4x4(uint16_t n, armral_cmplx_f32_t *p_dst) { const uint16_t mk = 4; +#if ARMRAL_ARCH_SVE >= 2 + svbool_t p4 = svptrue_pat_b32(SV_VL4); + svbool_t p2 = svptrue_pat_b32(SV_VL2); + + svfloat32x2_t b_r0 = svld2_f32(p4, (float32_t const *)&p_src_b[0]); + svfloat32x2_t b_r1 = svld2_f32(p4, (float32_t const *)&p_src_b[4]); + svfloat32x2_t b_r2 = svld2_f32(p4, (float32_t const *)&p_src_b[8]); + svfloat32x2_t b_r3 = svld2_f32(p4, (float32_t const *)&p_src_b[12]); + svfloat32x2_t *b_rows[] = {&b_r0, &b_r1, &b_r2, &b_r3}; + + for (uint32_t j = 0; j < n; j++) { + svfloat32_t dot_0 = svundef_f32(); + svfloat32_t dot_1 = svundef_f32(); + // Note: We leave it to the compiler to unroll this loop over mk + for (uint32_t r = 0; r < mk; r++) { + svfloat32_t a_rj = svld1_f32(p2, (float32_t const *)&p_src_a[r * n + j]); + // Note: We leave it to the compiler to eliminate the following branch + if (r == 0) { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmul_lane_f32(svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmul_lane_f32(svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } else { + // dot.re += a_jr.re * b_ir.re + a_jr.im * b_ir.im; + dot_0 = svmla_lane_f32(dot_0, svget2(*b_rows[r], 0), a_rj, 0); + dot_0 = 
svmla_lane_f32(dot_0, svget2(*b_rows[r], 1), a_rj, 1); + // dot.im += a_jr.re * b_ir.im - a_jr.im * b_ir.re; + dot_1 = svmla_lane_f32(dot_1, svget2(*b_rows[r], 1), a_rj, 0); + dot_1 = svmls_lane_f32(dot_1, svget2(*b_rows[r], 0), a_rj, 1); + } + } + + svfloat32x2_t dot = svcreate2(dot_0, dot_1); + svst2_f32(p4, (float32_t *)&p_dst[mk * j], dot); + } +#else float32x4x2_t b_rows[4] = {vld2q_f32((float32_t const *)&p_src_b[0]), vld2q_f32((float32_t const *)&p_src_b[4]), vld2q_f32((float32_t const *)&p_src_b[8]), @@ -137,6 +245,7 @@ static void cmplx_mat_mult_ahb_b4x4(uint16_t n, } vst2q_f32((float32_t *)&p_dst[mk * j], dot); } +#endif } #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c index 4712548..b24f257 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -642,8 +642,8 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, } #else - const float *p_in1 = (const float *)p_src_a; - const float *p_in2 = (const float *)p_src_b; + const float32_t *p_in1 = (const float32_t *)p_src_a; + const float32_t *p_in2 = (const float32_t *)p_src_b; const armral_cmplx_f32_t *p_in_a = p_src_a; armral_cmplx_f32_t *p_out = p_dst; armral_cmplx_f32_t *px; @@ -664,8 +664,8 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, float32x4_t b_col_real2; float32x4_t b_col_im2; float32x2_t accum = vdup_n_f32(0); - const float *p_in1_b = (const float *)p_src_a; - const float *p_in1_b2 = (const float *)p_src_b; + const float32_t *p_in1_b = (const float32_t *)p_src_a; + const float32_t *p_in1_b2 = (const float32_t *)p_src_b; uint16_t col; uint16_t i = 0U; @@ -690,7 +690,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* For every row wise process, the pIn2 pointer is set ** to the starting address of the pSrcB data */ - p_in2 = (const float *)p_src_b; + p_in2 = (const float32_t *)p_src_b; p_in1_b2 = p_in2 + 2 * num_cols_b; j = 0U; @@ -721,7 +721,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; float32x4_t acc_r0 = {}; @@ -883,7 +883,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, // /* Update the pointer pIn2 to point to the starting address of the // next column */ j++; - p_in2 = (const float *)p_src_b + 4U * j; + p_in2 = (const float32_t *)p_src_b + 4U * j; p_in1_b2 = p_in2 + 2U * num_cols_b; col--; } @@ -902,7 +902,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; float32x4_t acc_r0 = {}; @@ -1043,7 +1043,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* For every row wise process, the pIn2 pointer is set ** to the starting address of the pSrcB data */ - p_in2 = (const float *)p_src_b; + p_in2 = (const float32_t 
*)p_src_b; j = 0U; @@ -1058,7 +1058,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Initiate the pointer pIn1 to point to the starting address of the * column being processed */ - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; float32x4_t acc_r0 = {}; float32x4_t acc_i0 = {}; @@ -1144,7 +1144,7 @@ armral_cmplx_mat_mult_f32(const uint16_t m, const uint16_t n, const uint16_t k, /* Update the pointer pIn2 to point to the starting address of the next * column */ j++; - p_in2 = (const float *)p_src_b + 2U * j; + p_in2 = (const float32_t *)p_src_b + 2U * j; /* Decrement the column loop counter */ col--; diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c index f1b3341..8aa33c5 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c index fdfe709..54b182e 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c index 631c0b1..83f9ec1 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -15,8 +15,8 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, const armral_cmplx_f32_t *restrict p_src_a, const armral_cmplx_f32_t *restrict p_src_x, armral_cmplx_f32_t *p_dst) { - const float *p_in1 = (const float *)p_src_a; - const float *p_in2 = (const float *)p_src_x; + const float32_t *p_in1 = (const float32_t *)p_src_a; + const float32_t *p_in2 = (const float32_t *)p_src_x; const armral_cmplx_f32_t *p_in_a = p_src_a; armral_cmplx_f32_t *p_out = p_dst; uint16_t num_rows_a = m; // number of rows of input matrix A @@ -25,18 +25,18 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, #ifdef ARMRAL_ARCH_SVE svbool_t ptrue = svptrue_b32(); if (num_rows_a % 2 == 0) { - const float *p_in1_2 = (const float *)p_src_a; + const float32_t *p_in1_2 = (const float32_t *)p_src_a; // Loop over A two rows at a time for (uint16_t row_cnt = num_rows_a; row_cnt > 0; row_cnt -= 2, p_out += 2, p_in_a += 2 * num_cols_a) { - // Initialise p_in1 and p_in1_2 to point to the starting addresses of the + // Initialize p_in1 and p_in1_2 to point to the starting addresses of the // current rows - p_in1 = (const float *)p_in_a; + p_in1 = (const float32_t *)p_in_a; p_in1_2 = p_in1 + 2 * num_cols_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = 
(const float32_t *)p_src_x; // Initialize accumulators svfloat32_t sum = svdup_n_f32(0); @@ -107,12 +107,12 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, for (uint16_t row_cnt = num_rows_a; row_cnt > 0; --row_cnt, p_out++, p_in_a += num_cols_a) { - // Initialise p_in1 to point to the starting address of the current row - p_in1 = (const float *)p_in_a; + // Initialize p_in1 to point to the starting address of the current row + p_in1 = (const float32_t *)p_in_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = (const float32_t *)p_src_x; // Initialize accumulators svfloat32_t sum = svdup_n_f32(0); @@ -167,12 +167,12 @@ armral_cmplx_mat_vec_mult_f32(const uint16_t m, const uint16_t n, for (uint16_t row_cnt = num_rows_a; row_cnt > 0; --row_cnt, ++p_out, p_in_a += num_cols_a) { - // Initialise p_in1 to point to the starting address of the current row - p_in1 = (const float *)p_in_a; + // Initialize p_in1 to point to the starting address of the current row + p_in1 = (const float32_t *)p_in_a; // For every row wise process, the pIn2 pointer is set // to the starting address of the pSrcX data - p_in2 = (const float *)p_src_x; + p_in2 = (const float32_t *)p_src_x; // Initialize accumulators float32_t acc_re = 0.0; @@ -255,8 +255,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, float32x2_t acc_3; float32x2_t acc_4; { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2x2_t x_vec = vld1_f32_x2((float const *)current_x); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2x2_t x_vec = vld1_f32_x2((float32_t const *)current_x); acc_1 = vmul_laneq_f32(x_vec.val[0], a_vec, 0); acc_2 = vmul_laneq_f32(x_vec.val[0], a_vec, 1); acc_3 = vmul_laneq_f32(x_vec.val[1], a_vec, 2); @@ -265,8 +265,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2x2_t x_vec = vld1_f32_x2((float const *)current_x); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2x2_t x_vec = vld1_f32_x2((float32_t const *)current_x); acc_1 = vfma_laneq_f32(acc_1, x_vec.val[0], a_vec, 0); acc_2 = vfma_laneq_f32(acc_2, x_vec.val[0], a_vec, 1); acc_3 = vfma_laneq_f32(acc_3, x_vec.val[1], a_vec, 2); @@ -279,8 +279,8 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, vadd_f32(acc_1, vneg64_f32(vneg_f32(vrev64_f32(acc_2)))); float32x2_t result_2 = vadd_f32(acc_3, vneg64_f32(vneg_f32(vrev64_f32(acc_4)))); - vst1_f32((float *)(out_ptr + 0), result_1); - vst1_f32((float *)(out_ptr + 1), result_2); + vst1_f32((float32_t *)(out_ptr + 0), result_1); + vst1_f32((float32_t *)(out_ptr + 1), result_2); a_current_row_start += num_mats * n; out_ptr += num_mats; @@ -328,9 +328,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t acc_3; float32x4_t acc_4; { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x4_t x1_vec = vld1q_f32((float const *)current_x_1); - float32x4_t x2_vec = vld1q_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x4_t x1_vec = vld1q_f32((float32_t const *)current_x_1); + float32x4_t x2_vec = vld1q_f32((float32_t const *)current_x_2); acc_1 = vmulq_laneq_f32(x1_vec, a_vec, 0); acc_2 = vmulq_laneq_f32(x1_vec, a_vec, 1); acc_3 = vmulq_laneq_f32(x2_vec, a_vec, 
2); @@ -340,9 +340,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x4_t x1_vec = vld1q_f32((float const *)current_x_1); - float32x4_t x2_vec = vld1q_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x4_t x1_vec = vld1q_f32((float32_t const *)current_x_1); + float32x4_t x2_vec = vld1q_f32((float32_t const *)current_x_2); acc_1 = vfmaq_laneq_f32(acc_1, x1_vec, a_vec, 0); acc_2 = vfmaq_laneq_f32(acc_2, x1_vec, a_vec, 1); acc_3 = vfmaq_laneq_f32(acc_3, x2_vec, a_vec, 2); @@ -356,8 +356,8 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( vaddq_f32(acc_1, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_2)))); float32x4_t result_2 = vaddq_f32(acc_3, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_4)))); - vst1q_f32((float *)out_ptr_1, result_1); - vst1q_f32((float *)out_ptr_2, result_2); + vst1q_f32((float32_t *)out_ptr_1, result_1); + vst1q_f32((float32_t *)out_ptr_2, result_2); a_current_row_start += num_mats * n; out_ptr_1 += vec_stride; @@ -385,9 +385,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x2_t acc_3; float32x2_t acc_4; { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2_t x1_vec = vld1_f32((float const *)current_x_1); - float32x2_t x2_vec = vld1_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2_t x1_vec = vld1_f32((float32_t const *)current_x_1); + float32x2_t x2_vec = vld1_f32((float32_t const *)current_x_2); acc_1 = vmul_laneq_f32(x1_vec, a_vec, 0); acc_2 = vmul_laneq_f32(x1_vec, a_vec, 1); acc_3 = vmul_laneq_f32(x2_vec, a_vec, 2); @@ -397,9 +397,9 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( current_a += num_mats; } for (uint32_t col = 1; col < n; col++) { - float32x4_t a_vec = vld1q_f32((float const *)current_a); - float32x2_t x1_vec = vld1_f32((float const *)current_x_1); - float32x2_t x2_vec = vld1_f32((float const *)current_x_2); + float32x4_t a_vec = vld1q_f32((float32_t const *)current_a); + float32x2_t x1_vec = vld1_f32((float32_t const *)current_x_1); + float32x2_t x2_vec = vld1_f32((float32_t const *)current_x_2); acc_1 = vfma_laneq_f32(acc_1, x1_vec, a_vec, 0); acc_2 = vfma_laneq_f32(acc_2, x1_vec, a_vec, 1); acc_3 = vfma_laneq_f32(acc_3, x2_vec, a_vec, 2); @@ -413,8 +413,8 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( vadd_f32(acc_1, vneg64_f32(vneg_f32(vrev64_f32(acc_2)))); float32x2_t result_2 = vadd_f32(acc_3, vneg64_f32(vneg_f32(vrev64_f32(acc_4)))); - vst1_f32((float *)out_ptr_1, result_1); - vst1_f32((float *)out_ptr_2, result_2); + vst1_f32((float32_t *)out_ptr_1, result_1); + vst1_f32((float32_t *)out_ptr_2, result_2); a_current_row_start += num_mats * n; out_ptr_1 += vec_stride; @@ -438,16 +438,16 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t acc_1; float32x4_t acc_2; { - float32x2_t a_vec = vld1_f32((float const *)current_a); - float32x4_t x_vec = vld1q_f32((float const *)current_x); + float32x2_t a_vec = vld1_f32((float32_t const *)current_a); + float32x4_t x_vec = vld1q_f32((float32_t const *)current_x); acc_1 = vmulq_lane_f32(x_vec, a_vec, 0); acc_2 = vmulq_lane_f32(x_vec, a_vec, 1); current_a += num_mats; current_x += vec_stride; } for (uint32_t col = 1; col < n; col++) { - float32x2_t a_vec = vld1_f32((float const *)current_a); - float32x4_t x_vec = vld1q_f32((float const *)current_x); + float32x2_t a_vec = vld1_f32((float32_t const *)current_a); + 
float32x4_t x_vec = vld1q_f32((float32_t const *)current_x); acc_1 = vfmaq_lane_f32(acc_1, x_vec, a_vec, 0); acc_2 = vfmaq_lane_f32(acc_2, x_vec, a_vec, 1); current_a += num_mats; @@ -456,7 +456,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32( float32x4_t result = vaddq_f32(acc_1, vnegq64_f32(vnegq_f32(vrev64q_f32(acc_2)))); - vst1q_f32((float *)out_ptr, result); + vst1q_f32((float32_t *)out_ptr, result); out_ptr += vec_stride; a_current_row_start += num_mats * n; @@ -488,18 +488,18 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32_pa( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { // Set the accumulator variables to zero - float sum_real1 = 0; - float sum_imag1 = 0; - float sum_real2 = 0; - float sum_imag2 = 0; + float32_t sum_real1 = 0; + float32_t sum_imag1 = 0; + float32_t sum_real2 = 0; + float32_t sum_imag2 = 0; // Loop over the row of A one column at a time for (uint16_t col_cnt = 0; col_cnt < n; ++col_cnt) { int i = (row_cnt * n) + col_cnt; - float a_re = p_srcs_a[i][mat_idx].re; - float a_im = p_srcs_a[i][mat_idx].im; - float x_re = p_srcs_x[col_cnt][vec_idx].re; - float x_im = p_srcs_x[col_cnt][vec_idx].im; + float32_t a_re = p_srcs_a[i][mat_idx].re; + float32_t a_im = p_srcs_a[i][mat_idx].im; + float32_t x_re = p_srcs_x[col_cnt][vec_idx].re; + float32_t x_im = p_srcs_x[col_cnt][vec_idx].im; sum_real1 += a_re * x_re; sum_imag1 += a_im * x_re; sum_real2 -= a_im * x_im; diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c index 1f2ba56..325b206 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -47,7 +47,7 @@ armral_cmplx_mat_vec_mult_i16(const uint16_t m, const uint16_t n, int64_t sum_real1_b_ext = 0; int64_t sum_imag1_b_ext = 0; - // Initialise the pointer pIn1 to point to the starting address of the + // Initialize the pointer pIn1 to point to the starting address of the // column being processed p_in1 = (const int16_t *)p_in_a; p_in1_b = p_in1 + 2 * num_cols_a; @@ -259,7 +259,7 @@ armral_cmplx_mat_vec_mult_i16(const uint16_t m, const uint16_t n, int64_t sum_real2_ext = 0; int64_t sum_imag2_ext = 0; - // Initialise the pointer pIn1 to point to the starting address of the row + // Initialize the pointer pIn1 to point to the starting address of the row // being processed p_in1 = (const int16_t *)p_in_a; @@ -444,7 +444,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = m; row_cnt > 0; --row_cnt, p_out += vec_stride, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data @@ -553,7 +553,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = m; row_cnt > 0; --row_cnt, p_out += vec_stride, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting 
address of the pSrcX data @@ -654,7 +654,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, p_out += num_mats, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data @@ -734,7 +734,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, p_out += num_mats, p_in_a += num_mats * n) { - // Initialise pIn1 to point to the starting address of the current row + // Initialize pIn1 to point to the starting address of the current row const int16_t *p_in1 = (const int16_t *)p_in_a; // For every row, pIn2 is set to the starting address of the pSrcX data diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c index 5f0c082..f7fa1d5 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -87,7 +87,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += num_mats) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -166,7 +166,7 @@ cmplx_mat_vec_mult_batch_one_vec(uint16_t num_mats, uint16_t m, uint16_t n, for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += num_mats) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -238,7 +238,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += vec_stride) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -314,7 +314,7 @@ static armral_status cmplx_mat_vec_mult_batch_unroll_vec( for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt, a_ptr += num_mats * n, out_ptr += vec_stride) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -391,7 +391,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_one_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real; int32x4_t sum_imag; @@ -446,7 +446,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_one_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; @@ -506,7 +506,7 @@ static armral_status cmplx_mat_vec_mult_batch_pa_unroll_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator int32x4_t sum_real_lo; int32x4_t sum_real_hi; int32x4_t sum_imag_lo; @@ -570,7 +570,7 @@ 
static armral_status cmplx_mat_vec_mult_batch_pa_unroll_vec( // Loop over A one row at a time for (uint16_t row_cnt = 0; row_cnt < m; ++row_cnt) { - // Initialise the accumulator + // Initialize the accumulator svint32_t sum_real; svint32_t sum_imag; diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c index 6717b10..18124b5 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_convert.h" @@ -16,13 +16,14 @@ armral_status armral_solve_2x2_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const uint32_t p_xstride, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[2] = {1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1]}; + float32_t y_shifts[2] = {1 << p_y_num_fract_bits[0], + 1 << p_y_num_fract_bits[1]}; svbool_t pg32; for (uint32_t i = 0; @@ -170,14 +171,15 @@ armral_status armral_solve_2x4_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const uint32_t p_xstride, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = {1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1], - 1 << p_y_num_fract_bits[2], 1 << p_y_num_fract_bits[3]}; + float32_t y_shifts[4] = { + 1 << p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1], + 1 << p_y_num_fract_bits[2], 1 << p_y_num_fract_bits[3]}; svbool_t pg32; for (uint32_t i = 0; @@ -356,13 +358,13 @@ armral_status armral_solve_4x4_1sc_f32( uint32_t p_gstride, armral_cmplx_int16_t *p_x, uint32_t p_xstride, armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = { + float32_t y_shifts[4] = { 1U << (16 + p_y_num_fract_bits[0]), 1U << (16 + p_y_num_fract_bits[1]), 1U << (16 + p_y_num_fract_bits[2]), 1U << (16 + p_y_num_fract_bits[3])}; svbool_t pg16; @@ -657,13 +659,13 @@ armral_status armral_solve_1x4_1sc_f32( const uint32_t p_gstride, armral_cmplx_int16_t *p_x, const armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[4] = { + float32_t y_shifts[4] = { 1U << (16 + p_y_num_fract_bits[0]), 1U << (16 + p_y_num_fract_bits[1]), 1U << (16 + p_y_num_fract_bits[2]), 1U << (16 + p_y_num_fract_bits[3])}; svbool_t pg16; @@ -797,13 +799,14 @@ armral_status armral_solve_1x2_1sc_f32( uint32_t p_gstride, armral_cmplx_int16_t *p_x, armral_fixed_point_index num_fract_bits_x) { - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; int16_t *p_x_int16 = (int16_t *)p_x; const int16_t *p_y_int16 = (const int16_t *)p_y; #if ARMRAL_ARCH_SVE >= 2 - float y_shifts[2] = {1 << 
p_y_num_fract_bits[0], 1 << p_y_num_fract_bits[1]}; + float32_t y_shifts[2] = {1 << p_y_num_fract_bits[0], + 1 << p_y_num_fract_bits[1]}; svbool_t pg32; for (uint32_t i = 0; diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h index b0f9c8a..c542d2e 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c index 56bd0d8..e9a43c2 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_4sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h index f2963b8..f6854b7 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c index edadced..94dfcdc 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_6sc.h" #include "arm_solve_convert.h" @@ -472,7 +472,7 @@ armral_status armral_solve_4x4_6sc_f32( 1. / (1U << (16 + p_y_num_fract_bits[2])), 1. / (1U << (16 + p_y_num_fract_bits[3]))}; - float x_mult = 1 << num_fract_bits_x; + float32_t x_mult = 1 << num_fract_bits_x; // The loop is unrolled so that 2 matrices are computed per iteration, // therefore 12 subcarriers (i.e. 
y vectors) are used in each iteration diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h index 52bcf9d..249dbae 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_convert.h b/src/BasicMathFun/MatrixMult/arm_solve_convert.h index acc8842..cd8cb13 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_convert.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_convert.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -62,7 +62,7 @@ armral_convert_f32_i16_x8(const armral_fixed_point_index num_fract_bits, const float32x4_t *in_vector1_re, const float32x4_t *in_vector1_im, int16x8_t *out_vector0, int16x8_t *out_vector1) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re[0] * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im[0] * x_mult); @@ -93,7 +93,7 @@ armral_convert_f32_i16_x8_2(const armral_fixed_point_index num_fract_bits, const float32x4_t *in_vector0_re, const float32x4_t *in_vector0_im, int16x8_t *out_vector0) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re[0] * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im[0] * x_mult); @@ -114,7 +114,7 @@ armral_convert_f32_i16_x4(const armral_fixed_point_index num_fract_bits, float32x4_t in_vector0_re, float32x4_t in_vector0_im, float32x4_t in_vector1_re, float32x4_t in_vector1_im, int16x8_t *out_vector0, int16x8_t *out_vector1) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im * x_mult); @@ -134,7 +134,7 @@ static inline void __attribute__((always_inline)) armral_convert_f32_i16_x4_2(const armral_fixed_point_index num_fract_bits, float32x4_t in_vector0_re, float32x4_t in_vector0_im, int16x8_t *out_vector0) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; int32x4_t res00_re = vcvtq_s32_f32(in_vector0_re * x_mult); int32x4_t res00_im = vcvtq_s32_f32(in_vector0_im * x_mult); @@ -152,7 +152,7 @@ armral_convert_f32_fixed_i16(const armral_fixed_point_index num_fract_bits, const svfloat32_t in_vector_re, const svfloat32_t in_vector_im, const svbool_t pg32) { - float x_mult = 1 << num_fract_bits; + float32_t x_mult = 1 << num_fract_bits; svfloat32_t in32_re = svmul_n_f32_x(pg32, in_vector_re, x_mult); svfloat32_t in32_im = svmul_n_f32_x(pg32, in_vector_im, x_mult); svint32_t res_re = svcvt_s32_f32_x(pg32, in32_re); @@ -169,4 +169,4 @@ armral_convert_f32_i16(const svfloat32_t in_vector_re, svint16_t res_16_re = svqxtnb_s32(res_re); return svqxtnt_s32(res_16_re, res_im); } -#endif \ No newline at end of file +#endif diff --git a/src/BasicMathFun/MatrixMult/arm_solve_f32.c b/src/BasicMathFun/MatrixMult/arm_solve_f32.c index 75dfecd..62cf87a 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 
Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_4sc.h" diff --git a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp index 999db43..ebca77c 100644 --- a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp +++ b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" @@ -24,12 +24,17 @@ void left_pseudo_inverse(uint16_t m, const float32_t lambda, armral_cmplx_mat_mult_ahb_f32(m, n, n, p_src, p_src, mat_aha); // Compute C += lambda * I - armral::cmplx_mat_pseudo_inv::add_lambda(lambda, p_dst); + armral::cmplx_mat_pseudo_inv::add_lambda(lambda, mat_aha); // Compute B = C^(-1) auto mat_inv = allocate_uninitialized(allocator, n * n); - armral::cmplx_herm_mat_inv::invert_hermitian_matrix(mat_aha, - mat_inv.get()); + if constexpr (n == 1) { + mat_inv[0].re = 1.F / mat_aha[0].re; + mat_inv[0].im = 0.F; + } else { + armral::cmplx_herm_mat_inv::invert_hermitian_matrix(mat_aha, + mat_inv.get()); + } // Compute B * A^H armral::cmplx_mat_pseudo_inv::mat_mult_bah_f32(m, n, p_src, mat_inv.get(), @@ -51,8 +56,13 @@ void right_pseudo_inverse(uint16_t n, const float32_t lambda, // Compute B = C^(-1) auto mat_inv = allocate_uninitialized(allocator, m * m); - armral::cmplx_herm_mat_inv::invert_hermitian_matrix(mat_aah, - mat_inv.get()); + if constexpr (m == 1) { + mat_inv[0].re = 1.F / mat_aah[0].re; + mat_inv[0].im = 0.F; + } else { + armral::cmplx_herm_mat_inv::invert_hermitian_matrix(mat_aah, + mat_inv.get()); + } // Compute A^H * B armral_cmplx_mat_mult_ahb_f32(m, n, m, p_src, mat_inv.get(), p_dst); @@ -73,6 +83,10 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, // columns then use the left pseudo-inverse if (m > n) { switch (n) { + case 1: { + left_pseudo_inverse<1>(m, lambda, p_src, p_dst, allocator); + break; + } case 2: { left_pseudo_inverse<2>(m, lambda, p_src, p_dst, allocator); break; @@ -103,6 +117,10 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, // If the number of rows in the input matrix is less than or equal to the number // of columns then use the right pseudo-inverse switch (m) { + case 1: { + right_pseudo_inverse<1>(n, lambda, p_src, p_dst, allocator); + break; + } case 2: { right_pseudo_inverse<2>(n, lambda, p_src, p_dst, allocator); break; diff --git a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp index 04525b9..c4d8071 100644 --- a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp +++ b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ namespace armral::cmplx_mat_pseudo_inv { diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c index 9813462..9d95aa2 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c @@ -1,6 +1,6 
@@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -21,10 +21,10 @@ armral_cmplx_vecdot_f32(int32_t n, const armral_cmplx_f32_t *restrict p_src_a, int32_t i = 0; for (; i * num_lanes <= n - 2 * num_lanes; i += 2) { svbool_t pg = svptrue_b32(); - svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float *)p_src_b, i); - svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float *)p_src_a, i + 1); - svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float *)p_src_b, i + 1); + svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); + svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i + 1); + svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i + 1); acc0 = svcmla_f32_m(pg, acc0, vec_a0, vec_b0, 0); acc0 = svcmla_f32_m(pg, acc0, vec_a0, vec_b0, 90); @@ -34,8 +34,8 @@ armral_cmplx_vecdot_f32(int32_t n, const armral_cmplx_f32_t *restrict p_src_a, for (; i * num_lanes < n; ++i) { svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * n); - svfloat32_t vec_a = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b = svld1_vnum_f32(pg, (const float *)p_src_b, i); + svfloat32_t vec_a = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); acc0 = svcmla_f32_m(pg, acc0, vec_a, vec_b, 0); acc0 = svcmla_f32_m(pg, acc0, vec_a, vec_b, 90); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c index 4549d4a..4a2da77 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE @@ -8,11 +8,12 @@ #endif armral_status armral_cmplx_vecdot_f32_2(int32_t n, - const float *restrict p_src_a_re, - const float *restrict p_src_a_im, - const float *restrict p_src_b_re, - const float *restrict p_src_b_im, - float *p_src_c_re, float *p_src_c_im) { + const float32_t *restrict p_src_a_re, + const float32_t *restrict p_src_a_im, + const float32_t *restrict p_src_b_re, + const float32_t *restrict p_src_b_im, + float32_t *p_src_c_re, + float32_t *p_src_c_im) { #ifdef ARMRAL_ARCH_SVE int32_t num_lanes = svcntw(); int32_t full_vectors = n / num_lanes; diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c index 82ff5c4..8005432 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c index 36d4043..23516b9 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration 
Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c index 1bceca5..acf15f3 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c index 9a40c00..0eb7ab5 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c index a6660f2..36de2e3 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE @@ -17,14 +17,14 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, svbool_t pg = svptrue_b32(); for (int32_t i = 0; i < full_vectors; i++) { - svfloat32_t vec_a = svld1_f32(pg, (const float *)a); - svfloat32_t vec_b = svld1_f32(pg, (const float *)b); + svfloat32_t vec_a = svld1_f32(pg, (const float32_t *)a); + svfloat32_t vec_b = svld1_f32(pg, (const float32_t *)b); svfloat32_t vec_c = svdup_n_f32(0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 90); - svst1_f32(pg, (float *)c, vec_c); + svst1_f32(pg, (float32_t *)c, vec_c); a += num_64bit_lanes; b += num_64bit_lanes; @@ -35,23 +35,23 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, if (tail_size) { pg = svwhilelt_b32(0, 2 * tail_size); - svfloat32_t vec_a = svld1_f32(pg, (const float *)a); - svfloat32_t vec_b = svld1_f32(pg, (const float *)b); + svfloat32_t vec_a = svld1_f32(pg, (const float32_t *)a); + svfloat32_t vec_b = svld1_f32(pg, (const float32_t *)b); svfloat32_t vec_c = svdup_n_f32(0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 0); vec_c = svcmla_f32_x(pg, vec_c, vec_a, vec_b, 90); - svst1_f32(pg, (float *)c, vec_c); + svst1_f32(pg, (float32_t *)c, vec_c); } return ARMRAL_SUCCESS; #else uint32_t blk_cnt; /* Loop counter */ - float re_a; - float im_a; - float re_b; - float im_b; /* Temporary variables to store real and imaginary values */ + float32_t re_a; + float32_t im_a; + float32_t re_b; + float32_t im_b; /* Temporary variables to store real and imaginary values */ float32x4x2_t va; float32x4x2_t vb; @@ -62,8 +62,8 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, while (blk_cnt > 0U) { // load & separate real/imag (de-interleave 2) - va = vld2q_f32((const float *)a); - vb = vld2q_f32((const float *)b); + va = vld2q_f32((const float32_t *)a); + vb = 
vld2q_f32((const float32_t *)b); /* Increment pointers */ a += 4; @@ -77,7 +77,7 @@ armral_status armral_cmplx_vecmul_f32(int32_t n, out_cplx.val[1] = vmulq_f32(va.val[0], vb.val[1]); out_cplx.val[1] = vfmaq_f32(out_cplx.val[1], va.val[1], vb.val[0]); - vst2q_f32((float *)c, out_cplx); + vst2q_f32((float32_t *)c, out_cplx); /* Increment pointer */ c += 4; diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c index ac068af..5357eb7 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c @@ -1,17 +1,18 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE #include #endif -armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *restrict a_re, - const float *restrict a_im, - const float *restrict b_re, - const float *restrict b_im, float *c_re, - float *c_im) { +armral_status armral_cmplx_vecmul_f32_2(int32_t n, + const float32_t *restrict a_re, + const float32_t *restrict a_im, + const float32_t *restrict b_re, + const float32_t *restrict b_im, + float32_t *c_re, float32_t *c_im) { #ifdef ARMRAL_ARCH_SVE int32_t num_lanes = svcntw(); svbool_t pg = svptrue_b32(); @@ -87,10 +88,10 @@ armral_status armral_cmplx_vecmul_f32_2(int32_t n, const float *restrict a_re, return ARMRAL_SUCCESS; #else uint32_t blk_cnt; /* Loop counter */ - float re_a; - float im_a; - float re_b; - float im_b; /* Temporary variables to store real and imaginary values */ + float32_t re_a; + float32_t im_a; + float32_t re_b; + float32_t im_b; /* Temporary variables to store real and imaginary values */ float32x4x2_t vc_re; float32x4x2_t vc_im; diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp index 617ddba..373c25f 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c index 322e0cd..5e13b7f 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp index 34e0c2d..76d179f 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp index 582de2f..623c6ba 100644 --- 
a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp @@ -1,11 +1,12 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + +#include "../bit_unpacking_common.hpp" #include "utils/vec_mul.hpp" -#include #if ARMRAL_ARCH_SVE >= 2 #include #endif @@ -433,141 +434,7 @@ armral_status armral_mu_law_decompr_9bit(const uint32_t n_prb, } return ARMRAL_SUCCESS; #else - int16x8x2_t scale_v; - if (scale != nullptr) { - scale_v.val[0] = vdupq_n_s16(scale->re); - scale_v.val[1] = vdupq_n_s16(scale->im); - } - - for (uint32_t i = 0; i < n_prb; i++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - int16_t shift = src->exp; - - // ABCDEFGH - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - // BCDEFGHI - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - int8x8_t left_shifts = {0, 1, 2, 3, 4, 5, 6, 7}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - uint16x8_t a_left = vshll_n_u8(vshl_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vshl_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vshl_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); - - int16x8x3_t prb_comp_in; - prb_comp_in.val[0] = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> 7; - prb_comp_in.val[1] = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> 7; - prb_comp_in.val[2] = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> 7; - - // Extract the sign bit and absolute values for the PRB - int16x8x3_t prb_signs; - prb_signs.val[0] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[0]), vcgezq_s16(prb_comp_in.val[0]))); - prb_signs.val[1] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[1]), vcgezq_s16(prb_comp_in.val[1]))); - prb_signs.val[2] = vreinterpretq_s16_u16(vsubq_u16( - vcltzq_s16(prb_comp_in.val[2]), vcgezq_s16(prb_comp_in.val[2]))); - - int16x8x3_t prb_comp_abs; - int16x8_t sat_pos = vdupq_n_s16(255); - prb_comp_abs.val[0] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[0])); - prb_comp_abs.val[1] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[1])); - prb_comp_abs.val[2] = vminq_s16(sat_pos, vqabsq_s16(prb_comp_in.val[2])); - - // Expand each sample, absBitWidth=15, compBitWidth=9 - uint16x8x3_t check_thr1; - uint16x8x3_t check_thr3; - - // Expand - First Step: Set bitmasks based on prbCompAbs values - // Check1: if prbCompAbs <= 2^(compBitWidth - 2) = 128 - int16x8_t thr1_b9 = vdupq_n_s16(128); - check_thr1.val[0] = vcleq_s16(prb_comp_abs.val[0], thr1_b9); - check_thr1.val[1] = vcleq_s16(prb_comp_abs.val[1], thr1_b9); - check_thr1.val[2] = vcleq_s16(prb_comp_abs.val[2], thr1_b9); - - // Check3: if prbCompAbs > (2^(compBitWidth - 2) + 2^(compBitWidth - 3)) - int16x8_t thr2_b9 = vdupq_n_s16(192); - check_thr3.val[0] = vcgtq_s16(prb_comp_abs.val[0], thr2_b9); - check_thr3.val[1] = vcgtq_s16(prb_comp_abs.val[1], thr2_b9); - check_thr3.val[2] = vcgtq_s16(prb_comp_abs.val[2], thr2_b9); - - // Expand - Second Step: Perform decompression calculation - int16x8x3_t prb_abs_res1; - int16x8x3_t prb_abs_res2; - int16x8x3_t 
prb_abs_res3; - - // Case1: prbAbsRes1 = prbCompAbs * 2^(input_bits - output_bits) - // input_bits - output_bits = 6 - prb_abs_res1.val[0] = vqshlq_n_s16(prb_comp_abs.val[0], 6); - prb_abs_res1.val[1] = vqshlq_n_s16(prb_comp_abs.val[1], 6); - prb_abs_res1.val[2] = vqshlq_n_s16(prb_comp_abs.val[2], 6); - - // Case2: prbAbsRes2 = prbCompAbs * 2^(input_bits - output_bits + 1) - 2^13 - // input_bits - output_bits + 1 = 7 - int16x8_t sub_thr2_b9 = vdupq_n_s16(8192); - prb_abs_res2.val[0] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[0], 7), sub_thr2_b9); - prb_abs_res2.val[1] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[1], 7), sub_thr2_b9); - prb_abs_res2.val[2] = - vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[2], 7), sub_thr2_b9); - - // Case3: prbAbsRes3 = prbCompAbs * 2^(absBitWidth - compBitWidth + 2) - - // 2^15 - // input_bits - output_bits + 2 = 8 - uint16x8_t sub_comm_b9 = vdupq_n_u16(32768); - prb_abs_res3.val[0] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[0], 8), sub_comm_b9)); - prb_abs_res3.val[1] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[1], 8), sub_comm_b9)); - prb_abs_res3.val[2] = vreinterpretq_s16_u16( - vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[2], 8), sub_comm_b9)); - - // Expand - Fourth Step: OR among prbAbsRes vectors - int16x8x3_t exp_samples; - exp_samples.val[0] = vbslq_s16( - check_thr1.val[0], prb_abs_res1.val[0], - vbslq_s16(check_thr3.val[0], prb_abs_res3.val[0], prb_abs_res2.val[0])); - exp_samples.val[1] = vbslq_s16( - check_thr1.val[1], prb_abs_res1.val[1], - vbslq_s16(check_thr3.val[1], prb_abs_res3.val[1], prb_abs_res2.val[1])); - exp_samples.val[2] = vbslq_s16( - check_thr1.val[2], prb_abs_res1.val[2], - vbslq_s16(check_thr3.val[2], prb_abs_res3.val[2], prb_abs_res2.val[2])); - - // Apply sign and shift - exp_samples.val[0] = vmulq_s16(exp_samples.val[0], prb_signs.val[0]); - exp_samples.val[1] = vmulq_s16(exp_samples.val[1], prb_signs.val[1]); - exp_samples.val[2] = vmulq_s16(exp_samples.val[2], prb_signs.val[2]); - - int16x8_t comp_shift_vec = vdupq_n_s16(-shift); - - exp_samples.val[0] = vshlq_s16(exp_samples.val[0], comp_shift_vec); - exp_samples.val[1] = vshlq_s16(exp_samples.val[1], comp_shift_vec); - exp_samples.val[2] = vshlq_s16(exp_samples.val[2], comp_shift_vec); - - if (scale != nullptr) { - scale_and_store3_cmplx((int16_t *)dst, exp_samples.val[0], - exp_samples.val[1], exp_samples.val[2], scale_v); - dst += 12; - } else { - vst1q_s16((int16_t *)dst, exp_samples.val[0]); - dst += 4; - vst1q_s16((int16_t *)dst, exp_samples.val[1]); - dst += 4; - vst1q_s16((int16_t *)dst, exp_samples.val[2]); - dst += 4; - } - src++; - } + common_decompr_9bit_neon(n_prb, src, dst, scale); return ARMRAL_SUCCESS; #endif } diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp index 310c017..835a108 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp index 7d41c34..495ec44 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp +++ 
b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp @@ -1,12 +1,14 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + #if ARMRAL_ARCH_SVE >= 2 #include #endif +#include "../bit_unpacking_common.hpp" #include "utils/vec_mul.hpp" armral_status armral_block_float_decompr_8bit( @@ -190,58 +192,7 @@ armral_status armral_block_float_decompr_9bit( } return ARMRAL_SUCCESS; #else - for (uint32_t num_prb = 0; num_prb < n_prb; num_prb++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - int16_t exp = src->exp; - - // ABCDEFGH - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - // BCDEFGHI - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - uint8x8_t left_shifts = {1, 2, 4, 8, 16, 32, 64, 128}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - // e.g. for second lanes (x07=B, x18=C), extracting 9-bit bbbbbbbbb - // x07[1] = abbbbbbb - // x18[1] = bbcccccc - // > vshl_u8(x07, left_shifts) = vshl_u8(abbbbbbb, 1) = bbbbbbb0 - // > vshl_u8(x18, right_shifts) = vshl_u8(bbcccccc, -6) = 000000bb - // > vshll_n_u8(vshl_u8(x07, left_shifts), 8) = bbbbbbb0_00000000 - // > vshll_n_u8(vshl_u8(x18, right_shifts), 7) = 0000000b_b0000000 - // note how we have populated the sign-bit of the 16-bit lane. A shift - // right (dealing with the block exponent) therefore preserves the sign - // of the original value as expected. - uint16x8_t a_left = vshll_n_u8(vmul_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vmul_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vmul_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); - int16x8_t a_comb = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)); - int16x8_t b_comb = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)); - int16x8_t c_comb = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)); - a_comb = a_comb >> (7 - exp); - b_comb = b_comb >> (7 - exp); - c_comb = c_comb >> (7 - exp); - - if (scale != nullptr) { - vst1q_s16((int16_t *)&dst[0], cmplx_mul_combined_re_im(a_comb, *scale)); - vst1q_s16((int16_t *)&dst[4], cmplx_mul_combined_re_im(b_comb, *scale)); - vst1q_s16((int16_t *)&dst[8], cmplx_mul_combined_re_im(c_comb, *scale)); - } else { - vst1q_s16((int16_t *)&dst[0], a_comb); - vst1q_s16((int16_t *)&dst[4], b_comb); - vst1q_s16((int16_t *)&dst[8], c_comb); - } - - src++; - dst += 12; - } + common_decompr_9bit_neon(n_prb, src, dst, scale); return ARMRAL_SUCCESS; #endif } diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp index 7d96bf4..14f7488 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp 
b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp index c8fb82a..3de46c3 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp @@ -1,11 +1,12 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" + +#include "../bit_unpacking_common.hpp" #include "intrinsics.h" #include "utils/vec_mul.hpp" -#include #if ARMRAL_ARCH_SVE >= 2 #include @@ -174,60 +175,7 @@ armral_status armral_block_scaling_decompr_9bit( src++; } #else - for (unsigned int i = 0; i < n_prb; i++) { - const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; - - // Load in the input data byte by byte - uint8x8_t a07 = vld1_u8(&data_in[0]); - uint8x8_t b07 = vld1_u8(&data_in[9]); - uint8x8_t c07 = vld1_u8(&data_in[18]); - - uint8x8_t a18 = vld1_u8(&data_in[1]); - uint8x8_t b18 = vld1_u8(&data_in[10]); - uint8x8_t c18 = vld1_u8(&data_in[19]); - - int8x8_t left_shifts = {0, 1, 2, 3, 4, 5, 6, 7}; - int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; - - uint16x8_t a_left = vshll_n_u8(vshl_u8(a07, left_shifts), 8); - uint16x8_t b_left = vshll_n_u8(vshl_u8(b07, left_shifts), 8); - uint16x8_t c_left = vshll_n_u8(vshl_u8(c07, left_shifts), 8); - uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); - uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); - uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); - - // Get 9bit input elements - int16x8_t prb_comp_in[3]; - prb_comp_in[0] = vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> 7; - prb_comp_in[1] = vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> 7; - prb_comp_in[2] = vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> 7; - - // Decompression process - int16x8_t prb_decomp[3]; - int16x8_t scaling_factor = vdupq_n_s16(src->exp); - prb_decomp[0] = vmulq_s16(prb_comp_in[0], scaling_factor); - prb_decomp[1] = vmulq_s16(prb_comp_in[1], scaling_factor); - prb_decomp[2] = vmulq_s16(prb_comp_in[2], scaling_factor); - - // Store decompressed data - if (scale != nullptr) { - vst1q_s16((int16_t *)&dst[0], - cmplx_mul_combined_re_im(prb_decomp[0], *scale)); - vst1q_s16((int16_t *)&dst[4], - cmplx_mul_combined_re_im(prb_decomp[1], *scale)); - vst1q_s16((int16_t *)&dst[8], - cmplx_mul_combined_re_im(prb_decomp[2], *scale)); - dst += 12; - } else { - vst1q_s16((int16_t *)dst, prb_decomp[0]); - dst += 4; - vst1q_s16((int16_t *)dst, prb_decomp[1]); - dst += 4; - vst1q_s16((int16_t *)dst, prb_decomp[2]); - dst += 4; - } - src++; - } + common_decompr_9bit_neon(n_prb, src, dst, scale); #endif return ARMRAL_SUCCESS; } diff --git a/src/DuRuInterface/bit_packing_common.hpp b/src/DuRuInterface/bit_packing_common.hpp index 2c7af3b..070318a 100644 --- a/src/DuRuInterface/bit_packing_common.hpp +++ b/src/DuRuInterface/bit_packing_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/DuRuInterface/bit_unpacking_common.hpp b/src/DuRuInterface/bit_unpacking_common.hpp new file mode 100644 index 0000000..d30c7dd --- /dev/null +++ b/src/DuRuInterface/bit_unpacking_common.hpp @@ -0,0 +1,185 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include 
"utils/vec_mul.hpp" + +namespace { + +inline void mu_law_decomp_and_store_9bit_neon(int16x8x3_t &prb_comp, + armral_cmplx_int16_t *dst, + const armral_cmplx_int16_t *scale, + int16_t shift) { + // Extract the sign bit and absolute values for the PRB + int16x8x3_t prb_signs; + prb_signs.val[0] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[0]), vcgezq_s16(prb_comp.val[0]))); + prb_signs.val[1] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[1]), vcgezq_s16(prb_comp.val[1]))); + prb_signs.val[2] = vreinterpretq_s16_u16( + vsubq_u16(vcltzq_s16(prb_comp.val[2]), vcgezq_s16(prb_comp.val[2]))); + + int16x8x3_t prb_comp_abs; + int16x8_t sat_pos = vdupq_n_s16(255); + prb_comp_abs.val[0] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[0])); + prb_comp_abs.val[1] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[1])); + prb_comp_abs.val[2] = vminq_s16(sat_pos, vqabsq_s16(prb_comp.val[2])); + + // Expand each sample, absBitWidth=15, compBitWidth=9 + uint16x8x3_t check_thr1; + uint16x8x3_t check_thr3; + + // Expand - First Step: Set bitmasks based on prbCompAbs values + // Check1: if prbCompAbs <= 2^(compBitWidth - 2) = 128 + int16x8_t thr1_b9 = vdupq_n_s16(128); + check_thr1.val[0] = vcleq_s16(prb_comp_abs.val[0], thr1_b9); + check_thr1.val[1] = vcleq_s16(prb_comp_abs.val[1], thr1_b9); + check_thr1.val[2] = vcleq_s16(prb_comp_abs.val[2], thr1_b9); + + // Check3: if prbCompAbs > (2^(compBitWidth - 2) + 2^(compBitWidth - 3)) + int16x8_t thr2_b9 = vdupq_n_s16(192); + check_thr3.val[0] = vcgtq_s16(prb_comp_abs.val[0], thr2_b9); + check_thr3.val[1] = vcgtq_s16(prb_comp_abs.val[1], thr2_b9); + check_thr3.val[2] = vcgtq_s16(prb_comp_abs.val[2], thr2_b9); + + // Expand - Second Step: Perform decompression calculation + int16x8x3_t prb_abs_res1; + int16x8x3_t prb_abs_res2; + int16x8x3_t prb_abs_res3; + + // Case1: prbAbsRes1 = prbCompAbs * 2^(input_bits - output_bits) + // input_bits - output_bits = 6 + prb_abs_res1.val[0] = vqshlq_n_s16(prb_comp_abs.val[0], 6); + prb_abs_res1.val[1] = vqshlq_n_s16(prb_comp_abs.val[1], 6); + prb_abs_res1.val[2] = vqshlq_n_s16(prb_comp_abs.val[2], 6); + + // Case2: prbAbsRes2 = prbCompAbs * 2^(input_bits - output_bits + 1) - 2^13 + // input_bits - output_bits + 1 = 7 + int16x8_t sub_thr2_b9 = vdupq_n_s16(8192); + prb_abs_res2.val[0] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[0], 7), sub_thr2_b9); + prb_abs_res2.val[1] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[1], 7), sub_thr2_b9); + prb_abs_res2.val[2] = + vsubq_s16(vqshlq_n_s16(prb_comp_abs.val[2], 7), sub_thr2_b9); + + // Case3: prbAbsRes3 = prbCompAbs * 2^(absBitWidth - compBitWidth + 2) - + // 2^15 + // input_bits - output_bits + 2 = 8 + uint16x8_t sub_comm_b9 = vdupq_n_u16(32768); + prb_abs_res3.val[0] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[0], 8), sub_comm_b9)); + prb_abs_res3.val[1] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[1], 8), sub_comm_b9)); + prb_abs_res3.val[2] = vreinterpretq_s16_u16( + vsubq_u16(vqshluq_n_s16(prb_comp_abs.val[2], 8), sub_comm_b9)); + + // Expand - Fourth Step: OR among prbAbsRes vectors + int16x8x3_t exp_samples; + exp_samples.val[0] = vbslq_s16( + check_thr1.val[0], prb_abs_res1.val[0], + vbslq_s16(check_thr3.val[0], prb_abs_res3.val[0], prb_abs_res2.val[0])); + exp_samples.val[1] = vbslq_s16( + check_thr1.val[1], prb_abs_res1.val[1], + vbslq_s16(check_thr3.val[1], prb_abs_res3.val[1], prb_abs_res2.val[1])); + exp_samples.val[2] = vbslq_s16( + check_thr1.val[2], prb_abs_res1.val[2], + vbslq_s16(check_thr3.val[2], 
prb_abs_res3.val[2], prb_abs_res2.val[2])); + + // Apply sign and shift + exp_samples.val[0] = vmulq_s16(exp_samples.val[0], prb_signs.val[0]); + exp_samples.val[1] = vmulq_s16(exp_samples.val[1], prb_signs.val[1]); + exp_samples.val[2] = vmulq_s16(exp_samples.val[2], prb_signs.val[2]); + + int16x8_t comp_shift_vec = vdupq_n_s16(-shift); + + exp_samples.val[0] = vshlq_s16(exp_samples.val[0], comp_shift_vec); + exp_samples.val[1] = vshlq_s16(exp_samples.val[1], comp_shift_vec); + exp_samples.val[2] = vshlq_s16(exp_samples.val[2], comp_shift_vec); + + // Store + if (scale != nullptr) { + int16x8x2_t scale_v; + scale_v.val[0] = vdupq_n_s16(scale->re); + scale_v.val[1] = vdupq_n_s16(scale->im); + scale_and_store3_cmplx((int16_t *)dst, exp_samples.val[0], + exp_samples.val[1], exp_samples.val[2], scale_v); + } else { + vst1q_s16((int16_t *)&dst[0], exp_samples.val[0]); + vst1q_s16((int16_t *)&dst[4], exp_samples.val[1]); + vst1q_s16((int16_t *)&dst[8], exp_samples.val[2]); + } +} + +template +void common_decompr_9bit_neon(uint32_t n_prb, + const armral_compressed_data_9bit *src, + armral_cmplx_int16_t *dst, + const armral_cmplx_int16_t *scale) { + for (uint32_t i = 0; i < n_prb; i++) { + int16_t exp = src->exp; + const uint8_t *data_in = (const uint8_t *)&src->mantissa[0]; + + // Load in the input data byte by byte + // ABCDEFGH + uint8x8_t a07 = vld1_u8(&data_in[0]); + uint8x8_t b07 = vld1_u8(&data_in[9]); + uint8x8_t c07 = vld1_u8(&data_in[18]); + // BCDEFGHI + uint8x8_t a18 = vld1_u8(&data_in[1]); + uint8x8_t b18 = vld1_u8(&data_in[10]); + uint8x8_t c18 = vld1_u8(&data_in[19]); + + uint8x8_t left_shifts = {1, 2, 4, 8, 16, 32, 64, 128}; + int8x8_t right_shifts = {-7, -6, -5, -4, -3, -2, -1, 0}; + + uint16x8_t a_left = vshll_n_u8(vmul_u8(a07, left_shifts), 8); + uint16x8_t b_left = vshll_n_u8(vmul_u8(b07, left_shifts), 8); + uint16x8_t c_left = vshll_n_u8(vmul_u8(c07, left_shifts), 8); + uint16x8_t a_right = vshll_n_u8(vshl_u8(a18, right_shifts), 7); + uint16x8_t b_right = vshll_n_u8(vshl_u8(b18, right_shifts), 7); + uint16x8_t c_right = vshll_n_u8(vshl_u8(c18, right_shifts), 7); + + int16x8x3_t prb_comp; + int16_t shift = 7; + if constexpr (is_block_float) { + shift -= exp; + } + prb_comp.val[0] = + vreinterpretq_s16_u16(vorrq_u16(a_left, a_right)) >> shift; + prb_comp.val[1] = + vreinterpretq_s16_u16(vorrq_u16(b_left, b_right)) >> shift; + prb_comp.val[2] = + vreinterpretq_s16_u16(vorrq_u16(c_left, c_right)) >> shift; + + if constexpr (is_mu_law) { + mu_law_decomp_and_store_9bit_neon(prb_comp, dst, scale, exp); + } else { + if constexpr (not is_block_float) { // Block Scaling + prb_comp.val[0] = vmulq_n_s16(prb_comp.val[0], exp); + prb_comp.val[1] = vmulq_n_s16(prb_comp.val[1], exp); + prb_comp.val[2] = vmulq_n_s16(prb_comp.val[2], exp); + } + + // Block Float and Scaling store + if (scale != nullptr) { + vst1q_s16((int16_t *)&dst[0], + cmplx_mul_combined_re_im(prb_comp.val[0], *scale)); + vst1q_s16((int16_t *)&dst[4], + cmplx_mul_combined_re_im(prb_comp.val[1], *scale)); + vst1q_s16((int16_t *)&dst[8], + cmplx_mul_combined_re_im(prb_comp.val[2], *scale)); + } else { + vst1q_s16((int16_t *)&dst[0], prb_comp.val[0]); + vst1q_s16((int16_t *)&dst[4], prb_comp.val[1]); + vst1q_s16((int16_t *)&dst[8], prb_comp.val[2]); + } + } + dst += 12; + src++; + } +} + +} // namespace \ No newline at end of file diff --git a/src/LowerPHY/Correlation/arm_correlation.c b/src/LowerPHY/Correlation/arm_correlation.c index 71dce49..85cca8c 100644 --- a/src/LowerPHY/Correlation/arm_correlation.c +++ 
b/src/LowerPHY/Correlation/arm_correlation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FFT/fft_cf32.cpp b/src/LowerPHY/FFT/fft_cf32.cpp index 830bb02..28cbe6f 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" #include "fft_plan.hpp" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c index bf5176b..72cc33c 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gs.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h index 98033b7..ba99f76 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c index 23c5797..c70d093 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h index 8edbe46..f6bf005 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c index a61ff10..1b42288 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h index 57014ea..3c6966f 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c index cd7b9b1..84ba9f8 100644 
--- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h index 9b78818..dd75a55 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c index 53ef283..72e53ca 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_t_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h index 37e39fd..88f8678 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c index 86d9544..d46ed9d 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ab_t_gu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h index aaba874..896851e 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c index 33d7282..22ab0d6 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h index 8bbb2de..85fe5a0 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c 
b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c index cac45e5..d656718 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h index 9f0f294..fcef99b 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index 2da312a..c9cb8f7 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c index ecb6566..d00d376 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cf32_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h index fe8b750..8b47abc 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c index 609bf1d..5ccc3e6 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cs16_ac_n_uu.h" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h index 163f863..756cff9 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c index 20287bc..349d548 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h 
b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h index 8476f0e..864b0d9 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index 0231333..f45d07f 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_execute.hpp b/src/LowerPHY/FFT/fft_execute.hpp index 714d257..4cf5edd 100644 --- a/src/LowerPHY/FFT/fft_execute.hpp +++ b/src/LowerPHY/FFT/fft_execute.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_helper.h b/src/LowerPHY/FFT/fft_helper.h index 98f7c51..978ed19 100644 --- a/src/LowerPHY/FFT/fft_helper.h +++ b/src/LowerPHY/FFT/fft_helper.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_level.cpp b/src/LowerPHY/FFT/fft_level.cpp index a4402dc..e91517d 100644 --- a/src/LowerPHY/FFT/fft_level.cpp +++ b/src/LowerPHY/FFT/fft_level.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_level.hpp" diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index 06cc0df..fa0e64e 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index 833680a..4b10a61 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ b/src/LowerPHY/FFT/fft_plan.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_plan.hpp" #include "fft_cf32_kernel_lookup.h" @@ -305,7 +305,7 @@ template int factorize(int n, armral_fft_direction_t dir, int max_levels, armral::fft::lev_base_t **levels) { // search through the set of supported factors to find a suitable - // factorisation, then use that to build the level data structures. + // factorization, then use that to build the level data structures. 
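For readers skimming the plan code, the comment above summarizes what factorize() delegates to factorize_descending(): peel supported radices off n, largest first, and give up if n cannot be reduced to 1. The sketch below is a minimal, self-contained illustration of that idea; toy_factorize_descending, its radix list, and the example values are hypothetical and are not the library's implementation.

#include <vector>

// Minimal sketch, assuming a fixed radix list sorted largest-first.
// The real code also consults the per-direction kernel lookup tables and
// honors the max_levels limit, which this toy version ignores.
static std::vector<int> toy_factorize_descending(int n,
                                                 const std::vector<int> &radices) {
  std::vector<int> factors;
  while (n > 1) {
    bool reduced = false;
    for (int r : radices) {
      if (n % r == 0) {
        factors.push_back(r);
        n /= r;
        reduced = true;
        break;
      }
    }
    if (!reduced) {
      return {}; // analogous to the num_factors == 0 failure path in factorize()
    }
  }
  return factors; // e.g. n = 96 with radices {8, 6, 5, 4, 3, 2} gives {8, 6, 2}
}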
int factors[max_levels]; int num_factors = factorize_descending(n, dir, max_levels, factors); if (num_factors == 0) { diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index 4196223..b793eb2 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_types.hpp b/src/LowerPHY/FFT/fft_types.hpp index 65ebfa6..9cc6199 100644 --- a/src/LowerPHY/FFT/fft_types.hpp +++ b/src/LowerPHY/FFT/fft_types.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index a05479c..1678c6a 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader.hpp" diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index 6d1d21f..bbe53a1 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader_generator.cpp b/src/LowerPHY/FFT/rader_generator.cpp index 89e1386..9e798f2 100644 --- a/src/LowerPHY/FFT/rader_generator.cpp +++ b/src/LowerPHY/FFT/rader_generator.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader_generator.hpp" diff --git a/src/LowerPHY/FFT/rader_generator.hpp b/src/LowerPHY/FFT/rader_generator.hpp index bc219d9..49b3cfd 100644 --- a/src/LowerPHY/FFT/rader_generator.hpp +++ b/src/LowerPHY/FFT/rader_generator.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32.c b/src/LowerPHY/FIR/arm_fir_filter_cf32.c index 428e231..04ffc61 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -10,8 +10,8 @@ #ifdef ARMRAL_ARCH_SVE -static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float *in, - const float *coeffs, +static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float32_t *in, + const float32_t *coeffs, uint32_t n_taps) { // Compute FIR for four vector-lengths of data. 
Coeffs array is // unrolled by 2 and we have 2 accumulators per vector length, as @@ -110,10 +110,10 @@ static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float *in, return svcreate4(y1, y2, y3, y4); } -static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float *in, - const float *coeffs, +static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float32_t *in, + const float32_t *coeffs, uint32_t n_taps) { - // Compute FIR for 2 vector-lengths of data. Lightly optimised - this + // Compute FIR for 2 vector-lengths of data. Lightly optimized - this // function will be called at most once per call of // arm_fir_filter_cf32. Coefficient array is unrolled by factor 2, as // for fir_sve_blk, with the difference that we have two accumulators @@ -167,10 +167,11 @@ static inline svfloat32x2_t fir_sve_blk_2(svbool_t pg, const float *in, return svcreate2(y1, y2); } -static inline svfloat32_t fir_sve_blk(svbool_t pg, const float *in, - const float *coeffs, uint32_t n_taps) { +static inline svfloat32_t fir_sve_blk(svbool_t pg, const float32_t *in, + const float32_t *coeffs, + uint32_t n_taps) { // Compute FIR for one vector-length of data. This version is not - // really optimised, as it is only ever used as the tail of the more + // really optimized, as it is only ever used as the tail of the more // heavily unrolled versions above. The loop over the coeffs array is // unrolled by factor 2, since we can fit 2 complex values in a // quad-word. @@ -210,9 +211,9 @@ armral_status armral_fir_filter_cf32(uint32_t size, uint32_t taps, svbool_t ptrue_b32 = svptrue_b32(); uint32_t x_blk_idx = 0; uint32_t xinc = svcntw() * 2; - const float *c = (const float *)coeffs; - const float *in = (const float *)input; - float *out = (float *)output; + const float32_t *c = (const float32_t *)coeffs; + const float32_t *in = (const float32_t *)input; + float32_t *out = (float32_t *)output; for (; x_blk_idx + xinc * 2 < size * 2; x_blk_idx += xinc * 2) { svfloat32x4_t y = fir_sve_blk_4(ptrue_b32, in + x_blk_idx, c, taps); diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c index e60e5e0..ebeef5d 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -31,7 +31,7 @@ static inline void sv_fir_block(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); // ld2_u64 allows us to separate even and odd complex elements, for example // if i == 0 then: @@ -57,7 +57,7 @@ static inline void sv_fir_block(svbool_t pg, y = sv_full_cmla(pg, y, c, x); } - svst1_f32(pg, (float *)out, y); + svst1_f32(pg, (float32_t *)out, y); } static inline void sv_fir_block_2(svbool_t pg, @@ -84,7 +84,7 @@ static inline void sv_fir_block_2(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svuint64x2_t x1 = svld2_vnum_u64(pg, &in[i], 2); @@ -116,8 +116,8 @@ static inline void sv_fir_block_2(svbool_t pg, y_1 = sv_full_cmla(pg, y_1, c, x_1); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, 
(float *)out, 1, y_1); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); } static inline void sv_fir_block_4(svbool_t pg, @@ -142,7 +142,7 @@ static inline void sv_fir_block_4(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svfloat32_t x0_0 = svreinterpret_f32_u64(svget2(x0, 0)); svfloat32_t x1_0 = svreinterpret_f32_u64(svget2(x0, 1)); @@ -221,10 +221,10 @@ static inline void sv_fir_block_4(svbool_t pg, y_3 = sv_full_cmla(pg, y_3, c, x_3); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, (float *)out, 1, y_1); - svst1_vnum_f32(pg, (float *)out, 2, y_2); - svst1_vnum_f32(pg, (float *)out, 3, y_3); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); + svst1_vnum_f32(pg, (float32_t *)out, 2, y_2); + svst1_vnum_f32(pg, (float32_t *)out, 3, y_3); } static inline void sv_fir_block_8(svbool_t pg, @@ -248,7 +248,7 @@ static inline void sv_fir_block_8(svbool_t pg, uint32_t i = 0; for (; i + 2 <= taps; i += 2) { - svfloat32_t c = svld1rq_f32(pg, (const float *)&coeffs[i]); + svfloat32_t c = svld1rq_f32(pg, (const float32_t *)&coeffs[i]); svuint64x2_t x0 = svld2_u64(pg, &in[i]); svfloat32_t x0_0 = svreinterpret_f32_u64(svget2(x0, 0)); svfloat32_t x0_1 = svreinterpret_f32_u64(svget2(x0, 1)); @@ -361,14 +361,14 @@ static inline void sv_fir_block_8(svbool_t pg, y_7 = sv_full_cmla(pg, y_7, c, x_7); } - svst1_f32(pg, (float *)out, y_0); - svst1_vnum_f32(pg, (float *)out, 1, y_1); - svst1_vnum_f32(pg, (float *)out, 2, y_2); - svst1_vnum_f32(pg, (float *)out, 3, y_3); - svst1_vnum_f32(pg, (float *)out, 4, y_4); - svst1_vnum_f32(pg, (float *)out, 5, y_5); - svst1_vnum_f32(pg, (float *)out, 6, y_6); - svst1_vnum_f32(pg, (float *)out, 7, y_7); + svst1_f32(pg, (float32_t *)out, y_0); + svst1_vnum_f32(pg, (float32_t *)out, 1, y_1); + svst1_vnum_f32(pg, (float32_t *)out, 2, y_2); + svst1_vnum_f32(pg, (float32_t *)out, 3, y_3); + svst1_vnum_f32(pg, (float32_t *)out, 4, y_4); + svst1_vnum_f32(pg, (float32_t *)out, 5, y_5); + svst1_vnum_f32(pg, (float32_t *)out, 6, y_6); + svst1_vnum_f32(pg, (float32_t *)out, 7, y_7); } #endif diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16.c b/src/LowerPHY/FIR/arm_fir_filter_cs16.c index 71d03c1..c4fa695 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c index 0bb6947..302c446 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -350,7 +350,7 @@ static inline void sv_fir_block(svbool_t pg, const armral_cmplx_int16_t *restrict coeffs, armral_cmplx_int16_t *out, uint32_t taps) { // Compute FIR on one vector-length of data (read 2 vector-lengths, write 1). 
- // This version is only used as a tail for the more heavily optimised, + // This version is only used as a tail for the more heavily optimized, // unrolled versions above. const uint32_t *in = (const uint32_t *)input; diff --git a/src/LowerPHY/Scrambling/arm_scrambling.cpp b/src/LowerPHY/Scrambling/arm_scrambling.cpp index 3ff12f6..b9f1812 100644 --- a/src/LowerPHY/Scrambling/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/arm_scrambling.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp index 98452a0..d332880 100644 --- a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/SVD/arm_svd.cpp b/src/MatrixFactorizations/SVD/arm_svd.cpp similarity index 90% rename from src/SVD/arm_svd.cpp rename to src/MatrixFactorizations/SVD/arm_svd.cpp index 1d24eeb..b9e6cb1 100644 --- a/src/SVD/arm_svd.cpp +++ b/src/MatrixFactorizations/SVD/arm_svd.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -35,10 +35,10 @@ inline void cmplx_vecdot_conj_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, int32_t i = 0; for (; i * num_lanes <= n - 2 * num_lanes; i += 2) { svbool_t pg = svptrue_b32(); - svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float *)p_src_b, i); - svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float *)p_src_a, i + 1); - svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float *)p_src_b, i + 1); + svfloat32_t vec_a0 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b0 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); + svfloat32_t vec_a1 = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i + 1); + svfloat32_t vec_b1 = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i + 1); acc0 = svcmla_f32_m(pg, acc0, vec_b0, vec_a0, 0); acc0 = svcmla_f32_m(pg, acc0, vec_b0, vec_a0, 270); @@ -48,8 +48,8 @@ inline void cmplx_vecdot_conj_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, for (; i * num_lanes < n; ++i) { svbool_t pg = svwhilelt_b32(2 * i * num_lanes, 2 * n); - svfloat32_t vec_a = svld1_vnum_f32(pg, (const float *)p_src_a, i); - svfloat32_t vec_b = svld1_vnum_f32(pg, (const float *)p_src_b, i); + svfloat32_t vec_a = svld1_vnum_f32(pg, (const float32_t *)p_src_a, i); + svfloat32_t vec_b = svld1_vnum_f32(pg, (const float32_t *)p_src_b, i); acc0 = svcmla_f32_m(pg, acc0, vec_b, vec_a, 0); acc0 = svcmla_f32_m(pg, acc0, vec_b, vec_a, 270); @@ -218,8 +218,8 @@ inline void cmplx_axmy_f32(int32_t n, const armral_cmplx_f32_t *p_src_a, // in division by a small floating point number. // Epsilon is taken to be 2^{-(p-1)}/2, p=24 for float. // We use the same value of epsilon used in LAPACK. 
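As a quick cross-check of the two constants defined just below (an illustrative sketch only; eps_check and safemin_check are hypothetical names), the same values can be written in terms of the standard single-precision limits: the eps used here is half a unit in the last place, 2^-24, and safemin is the smallest normal float divided by that eps.

#include <cfloat>
#include <limits>

// Sketch only: reproduce the hard-coded constants from standard limits.
const float eps_check = std::numeric_limits<float>::epsilon() / 2.0F; // 2^-24 = 5.96046e-08
const float safemin_check = FLT_MIN / eps_check;                      // about 1.97e-31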
-const float eps = 5.96046E-08; -const float safemin = 1.17549E-38 / eps; +const float32_t eps = 5.96046E-08; +const float32_t safemin = 1.17549E-38 / eps; // Compute a * b inline armral_cmplx_f32_t mult_cf32(armral_cmplx_f32_t a, @@ -280,12 +280,12 @@ inline armral_cmplx_f32_t mult_add_cf32(armral_cmplx_f32_t a, } // Compute a * conj(a) -inline float square_conj_cf32(armral_cmplx_f32_t a) { +inline float32_t square_conj_cf32(armral_cmplx_f32_t a) { return a.re * a.re + a.im * a.im; } inline armral_cmplx_f32_t inv_cf32(armral_cmplx_f32_t a) { - float tmp = a.re * a.re + a.im * a.im; + float32_t tmp = a.re * a.re + a.im * a.im; return {a.re / tmp, -a.im / tmp}; } @@ -298,7 +298,7 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, armral_cmplx_f32_t alpha = aii; // Sum of x[i] * conj(x[i]) - float sum = 0.0F; + float32_t sum = 0.0F; for (int i = 0; i < n * incx; i += incx) { sum += square_conj_cf32(x[i]); } @@ -310,11 +310,11 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, // Add alpha * conj(alpha) to sum // to compute the 2 norm of the full vector sum += square_conj_cf32(alpha); - float beta = -copysign(sqrt(sum), alpha.re); - float rsafemin = 1.0F / safemin; + float32_t beta = -copysign(sqrt(sum), alpha.re); + float32_t rsafemin = 1.0F / safemin; int cnt = 0; int max_attempt = 10; - float scale = 1.0F; + float32_t scale = 1.0F; // Check if beta is small enough to induce // overflow when taking the inverse, and // if it is the case, scale to avoid overflow @@ -343,10 +343,10 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, armral_cmplx_f32_t tau; tau.re = (beta - alpha.re) / beta; tau.im = -alpha.im / beta; - armral_cmplx_f32_t normalisation_factor = + armral_cmplx_f32_t normalization_factor = inv_cf32({alpha.re - beta, alpha.im}); for (int i = 0; i < n * incx; i += incx) { - x[i] = mult_cf32(normalisation_factor, x[i]); + x[i] = mult_cf32(normalization_factor, x[i]); } beta /= scale; aii = {beta, 0.0F}; @@ -354,7 +354,8 @@ inline armral_cmplx_f32_t clarfg(int n, armral_cmplx_f32_t &aii, } // Computation of Givens rotation components. -inline void rotg(float f, float g, float &cs, float &sn, float &r) { +inline void rotg(float32_t f, float32_t g, float32_t &cs, float32_t &sn, + float32_t &r) { if (f == 0) { cs = 0.0F; sn = 1.0F; @@ -362,15 +363,15 @@ inline void rotg(float f, float g, float &cs, float &sn, float &r) { return; } if (std::abs(f) > std::abs(g)) { - float t = g / f; - float tt = sqrt(1 + t * t); + float32_t t = g / f; + float32_t tt = sqrt(1 + t * t); cs = 1 / tt; sn = t / tt; r = f * tt; return; } - float t = f / g; - float tt = sqrt(1 + t * t); + float32_t t = f / g; + float32_t tt = sqrt(1 + t * t); sn = 1 / tt; cs = t / tt; r = g * tt; @@ -379,8 +380,9 @@ inline void rotg(float f, float g, float &cs, float &sn, float &r) { // This routine updates singular vectors // by applying the Givens rotations // used to update the bidiagonal matrix -inline void update_sigvect(int m, float cs, float sn, armral_cmplx_f32_t *v1, - armral_cmplx_f32_t *v2, int incv) { +inline void update_sigvect(int m, float32_t cs, float32_t sn, + armral_cmplx_f32_t *v1, armral_cmplx_f32_t *v2, + int incv) { for (int i = 0; i < m * incv; i += incv) { auto t = v1[i]; v1[i].re = cs * t.re + sn * v2[i].re; @@ -390,7 +392,7 @@ inline void update_sigvect(int m, float cs, float sn, armral_cmplx_f32_t *v1, } } -// householder_qr computes the QR factorisation A = QR. +// householder_qr computes the QR factorization A = QR. 
// On exit, the elements on and above the diagonal // of the A contain the upper triangular matrix R. // The elements below the diagonal, with the array tau, @@ -426,8 +428,8 @@ armral_status armral_householder_qr(int m, int n, armral_cmplx_f32_t *a, return ARMRAL_SUCCESS; } -// Generate explicitly Q from QR factorisation or from -// the bidiagonalisation A = Q * B * P^H +// Generate explicitly Q from QR factorization or from +// the bidiagonalization A = Q * B * P^H armral_status armral_assemble_q(int m, int n, const armral_cmplx_f32_t *a, const armral_cmplx_f32_t *tau, armral_cmplx_f32_t *q) { @@ -469,7 +471,7 @@ armral_status armral_assemble_q(int m, int n, const armral_cmplx_f32_t *a, } // Generate the orthogonal matrix P from -// the bidiagonalisation A = Q * B * P^H, +// the bidiagonalization A = Q * B * P^H, // note that P^H is generated directly // instead of P void armral_assemble_p(int m, int n, const armral_cmplx_f32_t *a, @@ -539,8 +541,8 @@ void armral_assemble_p(int m, int n, const armral_cmplx_f32_t *a, // the bidiagonal matrix B. Note that this routine // returns directly the conjugate transpose of the // left orthogonal matrix. -armral_status armral_bidiagonalisation(int m, int n, armral_cmplx_f32_t *a, - float *d, float *e, +armral_status armral_bidiagonalization(int m, int n, armral_cmplx_f32_t *a, + float32_t *d, float32_t *e, armral_cmplx_f32_t *tauq, armral_cmplx_f32_t *taup) { if (m < n) { @@ -622,7 +624,8 @@ armral_status armral_bidiagonalisation(int m, int n, armral_cmplx_f32_t *a, // "Singular Value Decomposition and Least Squares Solutions" // published in Numer. Math. 14, 403--420 (1970). armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, - float *d, float *e, armral_cmplx_f32_t *u, + float32_t *d, float32_t *e, + armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, int u_stride) { if (m < n) { @@ -638,14 +641,14 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, // Compute the 1-norm of the bidiagonal matrix // for the computation of the stopping criteria. - float anorm = 0; + float32_t anorm = 0; for (int i = 0; i < n; i++) { - float tmp = std::abs(d[i]) + std::abs(e[i]); + float32_t tmp = std::abs(d[i]) + std::abs(e[i]); if (anorm < tmp) { anorm = tmp; } } - float tol = anorm * eps; + float32_t tol = anorm * eps; int maxiter = n * n; // Loop over the columns @@ -675,16 +678,16 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, // In this case, an extra sequence of Givens rotations is // applied from the left to annihilate the off-diagonal E[next_col]. if (diag_is_zero) { - float cs = 0.0; - float sn = 1.0; + float32_t cs = 0.0; + float32_t sn = 1.0; for (int i = next_col; i < curr_col; i++) { - float f = sn * e[i]; + float32_t f = sn * e[i]; e[i] *= cs; if (std::abs(f) <= tol) { break; } - float g = d[i]; - float h; + float32_t g = d[i]; + float32_t h; rotg(f, g, cs, sn, h); d[i] = h; // Update left singular vectors. @@ -694,7 +697,7 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, } } } - float z = d[curr_col]; + float32_t z = d[curr_col]; if (next_col == curr_col) { // Make singular value nonnegative and update // the corresponding right singular vectors. @@ -722,20 +725,20 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, // the 2 eigenvalues are (d1 + d2)/2 +/- sqrt(((d1 - d2)/2)^2 + e1^2). // The choice of this shift accelerates the convergence of the // most bottom off-diagonal E[curr_col] to zero. 
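Restating the shift comment above as a worked equation: the quoted expression is exactly the eigenvalue pair of a symmetric 2-by-2 block (the notation d_1, d_2, e_1 follows the comment; the block itself is not spelled out in this hunk, so it is shown here only to justify the formula).

M = \begin{pmatrix} d_1 & e_1 \\ e_1 & d_2 \end{pmatrix}, \qquad
\det(M - \lambda I) = \lambda^2 - (d_1 + d_2)\lambda + (d_1 d_2 - e_1^2) = 0,

\lambda_{\pm} = \frac{d_1 + d_2}{2} \pm \sqrt{\left(\frac{d_1 - d_2}{2}\right)^2 + e_1^2},

which matches the two eigenvalues quoted in the comment; the code below uses them to form the shift that drives E[curr_col] toward zero.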
- float x = d[next_col]; - float y = d[curr_col - 1]; - float g = e[curr_col - 1]; - float h = e[curr_col]; + float32_t x = d[next_col]; + float32_t y = d[curr_col - 1]; + float32_t g = e[curr_col - 1]; + float32_t h = e[curr_col]; // a^2 - b^2 operations are computed as // (a - b)* (a + b) to avoid overflow. - float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + float32_t f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); g = sqrt(f * f + 1); f = ((x - z) * (x + z) + h * (y / (f + copysign(g, f)) - h)) / x; // Shifted QR iteration, bulge chasing, applying // successive Givens rotations from right then from left. - float c = 1.0F; - float s = 1.0F; + float32_t c = 1.0F; + float32_t s = 1.0F; for (int i = next_col + 1; i <= curr_col; i++) { g = e[i]; y = d[i]; @@ -798,7 +801,7 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, } // Apply implicitly Q to an input matrix C of the same dimension -// as the matrix A that has been factorised into QR or bidiagonalisation. +// as the matrix A that has been factorized into QR or bidiagonalization. struct apply_q_work_buffers { armral_cmplx_f32_t *q; }; @@ -838,7 +841,7 @@ inline armral_status armral_apply_q(int m, int n, const armral_cmplx_f32_t *a, // matrix to a triangular form. inline int threshold_svd_qr(bool vector_needed, int m, int n) { - float crossover_point; + float32_t crossover_point; if (vector_needed) { // In this case, the computational complexities are: // 14 * m * n^2 + 8 * n^3 for direct svd, @@ -859,14 +862,14 @@ inline int threshold_svd_qr(bool vector_needed, int m, int n) { // armral_svd computes the SVD decomposition // of an m-by-n matrix A in 4 steps. -// 1- QR factorisation of A. -// 2- Bidiagonalisation of R. +// 1- QR factorization of A. +// 2- Bidiagonalization of R. // 3- SVD of the bidiagonal matrix from R. // 4- Update of the left singular vectors // with the orthogonal matrix from QR. template armral_status armral_qr_svd(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, Allocator &allocator) { @@ -876,7 +879,7 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n, auto r = allocate_zeroed(allocator, n * n); auto tauq = allocate_uninitialized(allocator, n); auto taup = allocate_uninitialized(allocator, n); - auto e = allocate_uninitialized(allocator, n); + auto e = allocate_uninitialized(allocator, n); // u1 and q have the same type as r, so we can reuse that pointer type. using cmplx_ptr = decltype(r); @@ -904,8 +907,8 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n, r_mat(i, j) = a_mat(i, j); } } - // Bidiagonalisation of R. - armral_bidiagonalisation(n, n, r.get(), s, e.get(), tauq.get(), taup.get()); + // Bidiagonalization of R. + armral_bidiagonalization(n, n, r.get(), s, e.get(), tauq.get(), taup.get()); // Generate left and right orthogonal vectors. if (maybe_u1.has_value()) { @@ -921,7 +924,7 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n, } } - // Initialise last n*(m-n) elements of u + // Initialize last n*(m-n) elements of u // to zero in case it is not. 
int remainder = m - n; for (int j = 0; j < n; j++) { @@ -946,18 +949,19 @@ armral_status armral_qr_svd(bool gen_singular_vect, int m, int n, template armral_status armral_svd(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, armral_cmplx_f32_t *u, - armral_cmplx_f32_t *vt, Allocator &allocator) { - // Bidiagonalisation: A = Q * B * P^H. + armral_cmplx_f32_t *a, float32_t *s, + armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, + Allocator &allocator) { + // Bidiagonalization: A = Q * B * P^H. auto tauq = allocate_uninitialized(allocator, n); auto taup = allocate_uninitialized(allocator, n); - auto e = allocate_uninitialized(allocator, n); + auto e = allocate_uninitialized(allocator, n); if constexpr (Allocator::is_counting) { return ARMRAL_SUCCESS; } - armral_bidiagonalisation(m, n, a, s, e.get(), tauq.get(), taup.get()); + armral_bidiagonalization(m, n, a, s, e.get(), tauq.get(), taup.get()); // Generate left and right orthogonal vectors if required. if (gen_singular_vect) { @@ -976,11 +980,11 @@ armral_status armral_svd(bool gen_singular_vect, int m, int n, // armral_svd computes the SVD decomposition // of an m-by-n matrix. It either performs // a direct SVD decomposition of the input matrix, -// or performs QR factorisation first followed +// or performs QR factorization first followed // by the SVD of R depending on the ratio m/n. template armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, Allocator &allocator) { @@ -995,14 +999,14 @@ armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, } // anonymous namespace armral_status armral_svd_cf32(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt) { heap_allocator allocator{}; return armral_svd_cf32(gen_singular_vect, m, n, a, s, u, vt, allocator); } armral_status armral_svd_cf32_noalloc(bool gen_singular_vect, int m, int n, - armral_cmplx_f32_t *a, float *s, + armral_cmplx_f32_t *a, float32_t *s, armral_cmplx_f32_t *u, armral_cmplx_f32_t *vt, void *buffer) { buffer_bump_allocator allocator{buffer}; diff --git a/src/SVD/matrix_view.hpp b/src/MatrixFactorizations/SVD/matrix_view.hpp similarity index 83% rename from src/SVD/matrix_view.hpp rename to src/MatrixFactorizations/SVD/matrix_view.hpp index cc2c4d8..6747418 100644 --- a/src/SVD/matrix_view.hpp +++ b/src/MatrixFactorizations/SVD/matrix_view.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/CRC/arm_crc11.cpp b/src/UpperPHY/CRC/arm_crc11.cpp index c65f3b1..d41889d 100644 --- a/src/UpperPHY/CRC/arm_crc11.cpp +++ b/src/UpperPHY/CRC/arm_crc11.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc16.cpp b/src/UpperPHY/CRC/arm_crc16.cpp index 42204c3..e727c60 100644 --- a/src/UpperPHY/CRC/arm_crc16.cpp +++ b/src/UpperPHY/CRC/arm_crc16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ 
#include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_a.cpp b/src/UpperPHY/CRC/arm_crc24_a.cpp index 3eac9c4..af8e43e 100644 --- a/src/UpperPHY/CRC/arm_crc24_a.cpp +++ b/src/UpperPHY/CRC/arm_crc24_a.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_b.cpp b/src/UpperPHY/CRC/arm_crc24_b.cpp index 6de6116..b0e9023 100644 --- a/src/UpperPHY/CRC/arm_crc24_b.cpp +++ b/src/UpperPHY/CRC/arm_crc24_b.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_c.cpp b/src/UpperPHY/CRC/arm_crc24_c.cpp index 0e5e4a7..42302a5 100644 --- a/src/UpperPHY/CRC/arm_crc24_c.cpp +++ b/src/UpperPHY/CRC/arm_crc24_c.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc6.cpp b/src/UpperPHY/CRC/arm_crc6.cpp index f907683..0277ba3 100644 --- a/src/UpperPHY/CRC/arm_crc6.cpp +++ b/src/UpperPHY/CRC/arm_crc6.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/crc_basic.hpp b/src/UpperPHY/CRC/crc_basic.hpp index 7c3dfcd..0e6e7df 100644 --- a/src/UpperPHY/CRC/crc_basic.hpp +++ b/src/UpperPHY/CRC/crc_basic.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 59460e0..47bf69e 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,13 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { - // Sometimes compilers don't realise that they don't need an extra + // Sometimes compilers don't realize that they don't need an extra // instruction to extract the 0th lane of a vector, e.g. when doing // vmull_p64(a[0], b[0]), so this just gets around that. poly128_t res; @@ -45,8 +45,8 @@ static inline poly64x2_t load_dup_p64(const poly64_t *p_in) { static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { // There are two reasons why we can't just use the vaddq_p64 intrinsic: // 1. It isn't available on the earliest GCC version we currently support - // 2. If GCC recognises that this is an associative operation, then it tries - // to optimise the operation tree in its tree-reassoc pass, but it + // 2. If GCC recognizes that this is an associative operation, then it tries + // to optimize the operation tree in its tree-reassoc pass, but it // actually makes the performance much worse. 
Hiding it in assembly means // that the compiler uses our carefully balanced operation tree instead. uint8x16_t res; diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp index 43cd7da..c2c6210 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" @@ -76,7 +76,7 @@ armral_status tail_biting_convolutional_decode_block( } uint8_t ro_best_i; - uint8_t ro_tb_best_i = states; // Initialised with impossible value + uint8_t ro_tb_best_i = states; // Initialized with impossible value uint8_t iter_cnt = 0; uint32x4_t preva_init = {0, 2, 4, 6}; diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp index c936f70..58d57d2 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp index 469b252..49ea3fd 100644 --- a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp +++ b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ namespace { diff --git a/src/UpperPHY/Demodulation/arm_demodulation.c b/src/UpperPHY/Demodulation/arm_demodulation.c index 2a30828..238abf0 100644 --- a/src/UpperPHY/Demodulation/arm_demodulation.c +++ b/src/UpperPHY/Demodulation/arm_demodulation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -484,7 +484,7 @@ armral_status armral_demodulation(const uint32_t n_symbols, const uint16_t ulp, armral_modulation_type mod_type, const armral_cmplx_int16_t *p_src, int8_t *p_dst) { - // If we don't set the return type, it's because the modType isn't recognised. + // If we don't set the return type, it's because the modType isn't recognized. // Therefore, we have an argument error by default. 
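  // (Only the recognized modulation cases in the switch below overwrite ret
  // with the status of the corresponding demodulation routine; any other
  // mod_type value falls through and the argument error is reported.)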
armral_status ret = ARMRAL_ARGUMENT_ERROR; switch (mod_type) { diff --git a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp index 0d4fa9b..33c4576 100644 --- a/src/UpperPHY/LDPC/ldpc_coding.hpp +++ b/src/UpperPHY/LDPC/ldpc_coding.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/LDPC/ldpc_decoder.cpp b/src/UpperPHY/LDPC/ldpc_decoder.cpp index 1948000..ba5297f 100644 --- a/src/UpperPHY/LDPC/ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "bit_utils.hpp" @@ -1382,7 +1382,7 @@ void armral::ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, size_t new_llrs_size = num_llrs; std::optional> maybe_out_llrs; if (!z_is_tiny) { - // Double the storage required to replicate LLRs for optimisation + // Double the storage required to replicate LLRs for optimization new_llrs_size *= 2; // Extra buffer to pack the LLRs again maybe_out_llrs = allocate_uninitialized(allocator, num_llrs); diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index 74a8fe4..655d7cb 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -1,12 +1,16 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" #include "ldpc_coding.hpp" #include "utils/allocators.hpp" +#ifdef ARMRAL_ARCH_SVE +#include +#endif + #include #include #include @@ -81,7 +85,7 @@ const uint32_t bg1_columns[] = { 1, 6, 10, 67 // row 45: 4 }; -// The shifts are organised by row, and then by index set. Each line in the +// The shifts are organized by row, and then by index set. Each line in the // following represents the shifts in one index set for one block row of the // matrix. Indexing into the array works as follows. 
If we are using index set k // for k in [0, 7], and are on block row i, then the indexing function from k, i @@ -946,6 +950,70 @@ inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t agg0 = svld1_u8(pg, ptr_agg); + svuint8_t agg1 = svld1_u8(pg, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result24 = sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg0); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg0 = svld1_u8(pg_tail, ptr_agg); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg0); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + } + + // Process the final row + { + codeword[(23 * z) - 1] = agg_parity[z - 1]; + codeword[(24 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(25 * z) - 1] = + parity_hdsm[3 * z - 1] ^ parity_hdsm[4 * z - 1] ^ agg_parity[0]; + codeword[(26 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +#else uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; @@ -1025,11 +1093,189 @@ inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, parity_hdsm[3 * z - 1] ^ parity_hdsm[4 * z - 1] ^ agg_parity[0]; codeword[(26 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; } +#endif } inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + if (z == 208) { + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + // zb = 0 to 104 + int32_t full_vectors = 105 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg103 = svld1_u8(pg, ptr_agg + 103); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = 
svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg103); + svuint8_t result24 = + sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg103)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg103); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg103); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = 105 - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg103 = svld1_u8(pg_tail, ptr_agg + 103); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg103); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg103)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg103); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg103); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + // Process zb = 105 to 207 + full_vectors = 103 / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg105 = svld1_u8(pg, ptr_agg - 105); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg105); + svuint8_t result24 = + sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg105)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg105); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg105); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + tail_size = 103 - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg105 = svld1_u8(pg_tail, ptr_agg - 105); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg105); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg105)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg105); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg105); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + } else { // z != 208 + + // Process the first row of the loop (zb =0) + { + codeword[22 * z] = agg_parity[z - 1]; + codeword[23 * z] = parity_hdsm[0] ^ 
agg_parity[z - 1]; + codeword[24 * z] = + parity_hdsm[2 * z] ^ parity_hdsm[3 * z] ^ agg_parity[z - 1]; + codeword[25 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + // Process zb = 1 to z + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg1 = svld1_u8(pg, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result24 = sveor_u8_x(pg, hdsm2z, sveor_u8_x(pg, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 22 * z, agg1); + svst1_u8(pg, data_out + 23 * z, result23); + svst1_u8(pg, data_out + 24 * z, result24); + svst1_u8(pg, data_out + 25 * z, result25); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsm2z = svld1_u8(pg_tail, ptr_hdsm + 2 * z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result23 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result24 = + sveor_u8_x(pg_tail, hdsm2z, sveor_u8_x(pg_tail, hdsm3z, agg1)); + svuint8_t result25 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 22 * z, agg1); + svst1_u8(pg_tail, data_out + 23 * z, result23); + svst1_u8(pg_tail, data_out + 24 * z, result24); + svst1_u8(pg_tail, data_out + 25 * z, result25); + // Increment pointers + ptr_agg += tail_size; + ptr_hdsm += tail_size; + data_out += tail_size; + } + } +#else if (z == 208) { uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; @@ -1219,12 +1465,77 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, codeword[(25 * z) + zb] = parity_hdsm[3 * z + zb] ^ agg_parity[zb - 1]; } } +#endif } inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + // Process the first row of the loop (zb =0) + { + codeword[10 * z] = agg_parity[z - 1]; + codeword[11 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; + codeword[12 * z] = parity_hdsm[0] ^ parity_hdsm[z] ^ agg_parity[z - 1]; + codeword[13 * z] = parity_hdsm[3 * z] ^ agg_parity[z - 1]; + } + + uint8_t *data_out = codeword + 1; + const uint8_t *ptr_agg = agg_parity + 1; + const uint8_t *ptr_hdsm = parity_hdsm + 1; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg1 = svld1_u8(pg, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result12 = sveor_u8_x(pg, hdsm0, sveor_u8_x(pg, 
hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 10 * z, agg1); + svst1_u8(pg, data_out + 11 * z, result11); + svst1_u8(pg, data_out + 12 * z, result12); + svst1_u8(pg, data_out + 13 * z, result13); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + + // Load inputs + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg - 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg_tail, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result12 = + sveor_u8_x(pg_tail, hdsm0, sveor_u8_x(pg_tail, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 10 * z, agg1); + svst1_u8(pg_tail, data_out + 11 * z, result11); + svst1_u8(pg_tail, data_out + 12 * z, result12); + svst1_u8(pg_tail, data_out + 13 * z, result13); + } +#else // Deal with the first row of the loop (zb =0) { codeword[10 * z] = agg_parity[z - 1]; @@ -1301,12 +1612,81 @@ inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, parity_hdsm[zb] ^ parity_hdsm[z + zb] ^ agg_parity[zb - 1]; codeword[(13 * z) + zb] = parity_hdsm[3 * z + zb] ^ agg_parity[zb - 1]; } +#endif } inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + uint8_t *data_out = codeword; + const uint8_t *ptr_agg = agg_parity; + const uint8_t *ptr_hdsm = parity_hdsm; + + int32_t full_vectors = (z - 1) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + // Load inputs + svuint8_t agg0 = svld1_u8(pg, ptr_agg); + svuint8_t agg1 = svld1_u8(pg, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg, hdsm0, agg1); + svuint8_t result12 = sveor_u8_x(pg, hdsm0, sveor_u8_x(pg, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg, data_out + 10 * z, agg0); + svst1_u8(pg, data_out + 11 * z, result11); + svst1_u8(pg, data_out + 12 * z, result12); + svst1_u8(pg, data_out + 13 * z, result13); + + // Increment pointers + ptr_agg += num_lanes; + ptr_hdsm += num_lanes; + data_out += num_lanes; + } + // Process tail + int32_t tail_size = (z - 1) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + + // Load inputs + svuint8_t agg0 = svld1_u8(pg_tail, ptr_agg); + svuint8_t agg1 = svld1_u8(pg_tail, ptr_agg + 1); + svuint8_t hdsm0 = svld1_u8(pg_tail, ptr_hdsm); + svuint8_t hdsmz = svld1_u8(pg_tail, ptr_hdsm + z); + svuint8_t hdsm3z = svld1_u8(pg_tail, ptr_hdsm + 3 * z); + + // Set the parity bits in the high-density sub-matrix + svuint8_t result11 = sveor_u8_x(pg_tail, hdsm0, agg1); + svuint8_t result12 = + sveor_u8_x(pg_tail, hdsm0, sveor_u8_x(pg_tail, hdsmz, agg1)); + svuint8_t result13 = sveor_u8_x(pg_tail, hdsm3z, agg1); + + // Store parity bits + svst1_u8(pg_tail, data_out + 10 * z, 
agg0); + svst1_u8(pg_tail, data_out + 11 * z, result11); + svst1_u8(pg_tail, data_out + 12 * z, result12); + svst1_u8(pg_tail, data_out + 13 * z, result13); + } + + // Process the final row outside of the loop + { + codeword[(11 * z) - 1] = agg_parity[z - 1]; + codeword[(12 * z) - 1] = parity_hdsm[z - 1] ^ agg_parity[0]; + codeword[(13 * z) - 1] = + parity_hdsm[z - 1] ^ parity_hdsm[2 * z - 1] ^ agg_parity[0]; + codeword[(14 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; + } +#else uint8_t *data_out = codeword; const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; @@ -1388,6 +1768,7 @@ inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, parity_hdsm[z - 1] ^ parity_hdsm[2 * z - 1] ^ agg_parity[0]; codeword[(14 * z) - 1] = parity_hdsm[4 * z - 1] ^ agg_parity[0]; } +#endif } // Set parity for base graph 1 @@ -1457,6 +1838,78 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, uint8_t *codeword) { auto max_ind = graph->nmessage_bits + 4; +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + for (uint32_t i = 4; i < graph->nrows; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then the lifting set index + // is added to this + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < max_ind; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = codeword + z * (graph->nmessage_bits + i); + auto *codeword_ptr = codeword + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, codeword_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, codeword_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + codeword_ptr = codeword + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, codeword_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + codeword_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, codeword_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, 
reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } + // We should have used every column apart from the last one + assert(j == col_entries - 1); + } +#else for (uint32_t i = 4; i < graph->nrows; ++i) { auto row_start_ind = graph->row_start_inds[i]; const auto *col_ptr = graph->col_inds + row_start_ind; @@ -1548,11 +2001,83 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, // We should have used every column apart from the last one assert(j == col_entries - 1); } +#endif } inline void spmv_hdsm(uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, uint8_t *bytes_in, uint8_t *parity_hdsm) { +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + + for (uint32_t i = 0; i < 4; ++i) { + auto row_start_ind = graph->row_start_inds[i]; + const auto *col_ptr = graph->col_inds + row_start_ind; + // Get the number of nonzero entries in the row + auto col_entries = graph->row_start_inds[i + 1] - row_start_ind; + // The shifts are stored for all index sets, so the pointer + // is first offset by the row start index multiplied by + // the number of index sets (8), and then + const auto *shift_ptr = graph->shifts + + row_start_ind * armral::ldpc::num_lifting_sets + + lsi * col_entries; + uint32_t j = 0; + for (; j < col_entries && col_ptr[j] < graph->nmessage_bits; ++j) { + // Perform the multiplication for each of the rows in the current block + auto block_col = col_ptr[j]; + // Shift the first row by the appropriate amount, and then + // wrap around when we reach the block size + auto shift = shift_ptr[j] % z; + auto *out_ptr = parity_hdsm + z * i; + auto *in_ptr = bytes_in + block_col * z + shift; + + // Process last (z - shift) elts + int32_t full_vectors = (z - shift) / num_lanes; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = (z - shift) - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + out_ptr += tail_size; + } + + // Process first shift elts + full_vectors = shift / num_lanes; + in_ptr = bytes_in + block_col * z; + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + tail_size = shift - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } + // We should have used every column apart from the last one + assert(j < col_entries && col_ptr[j] >= graph->nmessage_bits); + } +#else for (uint32_t i = 0; i < 4; ++i) { auto row_start_ind = graph->row_start_inds[i]; const auto *col_ptr = graph->col_inds + row_start_ind; @@ -1642,11 +2167,37 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, } assert(j < col_entries && col_ptr[j] >= graph->nmessage_bits); } +#endif } inline 
void copy_input_message(uint32_t z, const armral_ldpc_base_graph_t *graph, const uint8_t *bytes_in, uint8_t *codeword) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + int32_t full_vectors = z / num_lanes; + int32_t tail_size = z - (full_vectors * num_lanes); + + for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { + uint8_t *out_ptr = codeword + j * z; + const uint8_t *in_ptr = bytes_in + j * z; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg = svld1_u8(pg, in_ptr); + svst1_u8(pg, out_ptr, reg); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg = svld1_u8(pg_tail, in_ptr); + svst1_u8(pg_tail, out_ptr, reg); + } + } +#else for (uint32_t j = 0; j < graph->nmessage_bits; ++j) { uint8_t *out_ptr = codeword + j * z; @@ -1679,10 +2230,60 @@ inline void copy_input_message(uint32_t z, codeword[j * z + zb] = bytes_in[j * z + zb]; } } +#endif } inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, uint8_t *tmp_parity) { + +#if ARMRAL_ARCH_SVE + int32_t num_lanes = svcntb(); + svbool_t pg = svptrue_b8(); + int32_t full_vectors = z / num_lanes; + + // First iteration, tmp_parity is vector of 0 + uint8_t *out_ptr = tmp_parity; + const uint8_t *in_ptr = parity_hdsm; + svuint8_t reg1 = svdup_n_u8(0); + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + int32_t tail_size = z - (full_vectors * num_lanes); + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + + // Iteration 1 to 3 + for (uint32_t j = 1; j < 4; ++j) { + out_ptr = tmp_parity; + in_ptr = parity_hdsm + z * j; + + for (int32_t vec_idx = 0; vec_idx < full_vectors; ++vec_idx) { + reg1 = svld1_u8(pg, out_ptr); + svuint8_t reg2 = svld1_u8(pg, in_ptr); + svuint8_t result = sveor_u8_x(pg, reg1, reg2); + svst1_u8(pg, out_ptr, result); + out_ptr += num_lanes; + in_ptr += num_lanes; + } + // Process tail + if (tail_size != 0) { + svbool_t pg_tail = svwhilelt_b8(0, tail_size); + reg1 = svld1_u8(pg_tail, out_ptr); + svuint8_t reg2 = svld1_u8(pg_tail, in_ptr); + svuint8_t result = sveor_u8_x(pg_tail, reg1, reg2); + svst1_u8(pg_tail, out_ptr, result); + } + } +#else // First iteration, tmp_parity is vector of 0 uint8_t *out_ptr = tmp_parity; const uint8_t *in_ptr = parity_hdsm; @@ -1752,6 +2353,7 @@ inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, tmp_parity[zb] ^= parity_hdsm[j * z + zb]; } } +#endif } template diff --git a/src/UpperPHY/LDPC/ldpc_rate_common.hpp b/src/UpperPHY/LDPC/ldpc_rate_common.hpp index 3858f49..e4037ea 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_common.hpp +++ b/src/UpperPHY/LDPC/ldpc_rate_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp index 2324c2f..9ab6760 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - 
Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp index 6fa9b6c..206e654 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "ldpc_rate_common.hpp" diff --git a/src/UpperPHY/Modulation/arm_modulation.c b/src/UpperPHY/Modulation/arm_modulation.c index 96c91bb..242acb4 100644 --- a/src/UpperPHY/Modulation/arm_modulation.c +++ b/src/UpperPHY/Modulation/arm_modulation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp index 3925063..341b4b0 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_crc_check.cpp b/src/UpperPHY/Polar/arm_polar_crc_check.cpp index f885073..b0682cd 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_check.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_check.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_decoder.cpp b/src/UpperPHY/Polar/arm_polar_decoder.cpp index 4c4cc8d..b1db620 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.cpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -38,7 +38,7 @@ inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, // g(a_h, b_h, c_i=1) = a_h - b_h // This matches the non-list version, but for L > 1 we need to take care of // permuting the input beliefs by the list history value rather than simply - // vectorising the beliefs directly. + // vectorizing the beliefs directly. 
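  // (Concretely: each of the L list entries carries its own survivor history,
  // so the beliefs feeding this g() update are first gathered according to
  // that history (see the table-lookup based g_l_impl specializations) before
  // the same arithmetic as the single-path case is applied.)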
if constexpr (L > 1) { g_l_impl::g_l(in, dec, hist, out); } else { diff --git a/src/UpperPHY/Polar/arm_polar_decoder.hpp b/src/UpperPHY/Polar/arm_polar_decoder.hpp index ef2091c..7989bac 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp index 249f2e0..fb20d2e 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once @@ -130,7 +130,7 @@ template<> struct g_l_impl<2, 4> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=4). + // specialized N=2-byte chunks interleaved (times L=4). uint8x8_t h8 = vld_hist_l4(hist); uint8x8_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4}; g_l_x8(in, dec, h8, xs_idx, out); @@ -141,7 +141,7 @@ template<> struct g_l_impl<2, 8> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=8). + // specialized N=2-byte chunks interleaved (times L=8). uint8x16_t h8 = vld_histq_l8(hist); uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; g_l_x16(in, dec, h8, xs_idx, out); @@ -162,7 +162,7 @@ template<> struct g_l_impl<4, 4> { static inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, int8_t *out) { - // specialised N=4-byte chunks interleaved (times L=4). + // specialized N=4-byte chunks interleaved (times L=4). uint8x16_t h8 = vld_histq_l4(hist); uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; g_l_x16(in, dec, h8, xs_idx, out); diff --git a/src/UpperPHY/Polar/arm_polar_encoder.c b/src/UpperPHY/Polar/arm_polar_encoder.c index cd74125..5936f57 100644 --- a/src/UpperPHY/Polar/arm_polar_encoder.c +++ b/src/UpperPHY/Polar/arm_polar_encoder.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index 6887a74..6c5fe66 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -20,7 +20,7 @@ constexpr frozen_arrays fa; // reliability sequences for particular polar code sizes, derived from table // 5.3.1.2-1 in 3GPP TS 38.212. These are stored in reverse order, so go from // most reliable bit to least reliable. This is so that we can iterate forwards -// through these, which makes vectorisation easier. +// through these, which makes vectorization easier. 
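// For illustration only (not library code): with this ordering, choosing the
// K information-bit positions of a length-32 code is a simple forward scan,
// e.g.
//   for (uint16_t i = 0; i < K; ++i) { info_set[q32[i]] = true; }
// and the remaining, less reliable positions are left frozen.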
constexpr uint16_t q32[] = {31, 30, 29, 27, 23, 15, 28, 22, 25, 26, 21, 14, 13, 19, 11, 7, 24, 20, 12, 18, 10, 17, 6, 9, 5, 3, 16, 8, 4, 2, 1, 0}; diff --git a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp index 2d6ca39..dbf884c 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp index d1c6ff5..06c903e 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp index 8d0a317..6b1b86a 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp index d450c19..192d339 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index f0935c5..f2415fd 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -8,546 +8,11 @@ #include "turbo_tables.hpp" #include "utils/allocators.hpp" -#include -#include - -namespace { - -// With Turbo codes n (=k) is always divisible by 8 so we -// do not have to worry about tail bits -inline void turbo_llrs_to_bits(uint32_t n, const float32x4_t *llr, - uint8_t *data_out) { - uint32_t full_bytes = n >> 3; - constexpr uint32x4_t ones_0 = {128, 64, 32, 16}; - constexpr uint32x4_t ones_1 = {8, 4, 2, 1}; - - for (uint32_t i = 0; i < full_bytes; ++i) { - // The first bit to write in the byte is the most significant - uint32x4_t pred_0 = vcltzq_f32(llr[i * 2]); - uint32x4_t pred_1 = vcltzq_f32(llr[i * 2 + 1]); - uint32x4_t mask_0 = vandq_u32(pred_0, ones_0); - uint32x4_t mask_1 = vandq_u32(pred_1, ones_1); - uint32x4_t mask_2 = vorrq_u32(mask_0, mask_1); - data_out[i] = (uint8_t)vaddvq_u32(mask_2); - } -} - -// Take the input int8_t LLRs and convert them to float32x4_ts -inline void convert_llrs(uint32_t k, const int8_t *llrs, - float32x4_t *llrs_f32) { - constexpr 
int8x16_t idx_0 = {127, 127, 127, 0, 127, 127, 127, 1, - 127, 127, 127, 2, 127, 127, 127, 3}; - constexpr int8x16_t idx_1 = {127, 127, 127, 4, 127, 127, 127, 5, - 127, 127, 127, 6, 127, 127, 127, 7}; - // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time - for (uint32_t i = 0, j = 0; i < k; i += 8, j += 2) { - int8x8_t data = vld1_s8(&llrs[i]); - int32x4_t ldata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_0)); - int32x4_t hdata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_1)); - llrs_f32[j] = vcvtq_n_f32_s32(ldata, 24); - llrs_f32[j + 1] = vcvtq_n_f32_s32(hdata, 24); - } -} - -// Calculate the PDF of the state transition probability on the assumption that -// we are operating on an AWGN channel: -// PDF = (x1/2 (l_uk + l_c*y1)) + (l_c/2 x2 y2) -// In our implementation we assume the channel reliability, l_c, -// has been prescaled by 1/2 to avoid doing so repeatedly here. -template -inline float32x4_t transition_pdf(float32x4_t l_uk, float32x4_t l_c, - float32x4_t y1, float32x4_t y2) { - if constexpr (use_extrinsic) { - float32x4_t term1 = - vmulq_n_f32(vfmaq_f32(vmulq_n_f32(l_uk, 0.5F), l_c, y1), x1); - float32x4_t term2 = vmulq_f32(vmulq_n_f32(l_c, (float32_t)x2), y2); - return vaddq_f32(term1, term2); - } else { - return vmulq_f32(l_c, vaddq_f32(vmulq_n_f32(y1, (float32_t)x1), - vmulq_n_f32(y2, (float32_t)x2))); - } -} - -// Update the extrinsic information output from the decoding stage -// based on the computed LLRs, the old extrinsic information and the input. -inline void update_extrinsic(uint32_t len, const float32x4_t *llr, - float32x4_t *extrinsic, const float32x4_t *input) { - for (uint32_t i = 0; i < len; i++) { - extrinsic[i] = vsubq_f32(vsubq_f32(llr[i], extrinsic[i]), input[i]); - } -} - -// Calculate the trellis termination values. These are independent of the -// extrinsic information and so can be done once without needing to be updated -// on every iteration. -void trellis_termination(const float32x4_t *sys, const float32x4_t *par, - uint32_t k4, float32x4_t l_c, float32x4_t *beta_out) { - // We handle the gammas for the trellis termination bits separately - // as the state transitions are different. The x_{kl} are never 1 - // here, because we always use inputs of 0 to drive the trellis back - // to state 0 in the encoder, so we only need to consider a smaller - // number of state transitions. We also do not have any extrinsic - // information. Because some of the gamma terms will - // always be -INFINITY (specifically indices [1] and [3]) we can forgo - // adding to them to beta or taking the max with them, compared with - // when we calculate beta in the main calculations. 
- float32x4_t unused_extrinsic = {0}; - float32x4_t pdf_00 = - transition_pdf<1, 1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); - float32x4_t pdf_01 = - transition_pdf<1, -1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); - - // We need b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]} - float32x4_t pdf_uzp1 = vuzp1q_f32(pdf_00, pdf_01); - float32x4_t b01 = vtrn2q_f32(pdf_uzp1, pdf_uzp1); - - // We need g01_02 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]}; - float32x4_t pdf_uzp2 = vuzp2q_f32(pdf_00, pdf_01); - float32x4_t g01_02 = vuzp1q_f32(pdf_uzp2, pdf_uzp2); - - float32x4_t beta_term = vaddq_f32(g01_02, b01); - - // We need g01_02_1 = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0]}; - float32x4_t g01_02_1 = vuzp1q_f32(pdf_uzp1, pdf_uzp1); - - // We need b01_1 = {beta_term[0], beta_term[0], beta_term[1], beta_term[1]}; - float32x4_t b01_1 = vzip1q_f32(beta_term, beta_term); - beta_out[0] = vaddq_f32(g01_02_1, b01_1); - - // We need g23_02_1 = {pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]}; - float32x4_t g23_02_1 = vrev64q_f32(g01_02_1); - - // We need b23_1 = {beta_term[2], beta_term[2], beta_term[3], beta_term[3]}; - float32x4_t b23_1 = vzip2q_f32(beta_term, beta_term); - beta_out[1] = vaddq_f32(g23_02_1, b23_1); -} - -// A single max-log-MAP decoder that works on an array of systematic bits (sys), -// an array of parity bits (par), and an array of extrinsic values from a -// previous decoding stage (extrinsic) -void decode_step(const float32x4_t *sys, const float32x4_t *par, - const float32x4_t *extrinsic, uint32_t k4, float32x4_t *llr, - float32x4_t *alpha, const float32x4_t *beta_tail, - float32x4x4_t *pdf4, float32x4_t l_c) { - uint32_t k_idx; - uint32_t kp1_idx; - - constexpr uint8x16_t rev_idx = {12, 13, 14, 15, 8, 9, 10, 11, - 4, 5, 6, 7, 0, 1, 2, 3}; - - // Start by computing the non-zero conditional state transition probabilities - // from state s' to state s for every k, denoted gamma_k(s',s). In general for - // an AWGN channel (ignoring extrinsic information in l_uk): - // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) - // Here there are only 2 possible state transitions into each state - // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 - // terms. - for (uint32_t i = 0; i < k4; i++) { - // The x_{kl} values are the actual systematic and parity values that - // would result from the encoder having transited from state s' to s. - // They can only ever be either 0 or 1 so we precompute the four possible - // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note - // that these 0s and 1s have to be converted to 1s and -1s to match the - // values in y - // - // The y_{kl} values are the observed the systematic and parity inputs. - // These have potentially been perturbed by noise on the channel - // - // Although each of the 8 states of the encoder has in theory 8 - // predecessor states, the encoder's structure means that not all state - // transitions are possible. Each state actually only has 2 predecessor - // states so we only have to compute 16 non-zero values for each input - // LLR. - float32x4_t pdf_00 = - transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_10 = - transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_01 = - transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); - float32x4_t pdf_11 = - transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); - - // There is considerable duplication in the values we could store. 
For - // example, for a single state the 16 gamma values are: - // - // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; - // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; - // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; - // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; - // - // We therefore choose to store the 4 unique pdf values (using st4) - // as this allows us to access the pdf values contiguously in the - // calculations needed for the alpha and beta values - vst4q_f32((float32_t *)&pdf4[i], - float32x4x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); - - // Accumulate the state transition probabilities forwards through the - // state transition trellis starting from the known encoder start state 0 - for (uint32_t j = 0; j < 4; j++) { - k_idx = 8 * i + j * 2; - kp1_idx = k_idx + 2; - - // We need g0 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], - // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0]}; - // a02 = {alpha[k_idx][0], alpha[k_idx][2], - // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; - float32x4_t g0 = pdf4[i].val[j]; - float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t left_1 = vaddq_f32(g0, a02); - // We need g2 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][2], - // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; - // a13 = {alpha[k_idx][1], alpha[k_idx][3], - // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; - float32x4_t g2 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), rev_idx)); - float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t right_1 = vaddq_f32(g2, a13); - alpha[kp1_idx] = vmaxq_f32(left_1, right_1); - - // We need g1 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][1], - // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][1]}; - // which is g2 above - float32x4_t left_2 = vaddq_f32(g2, a02); - // We need g3 = {gamma[g_k_idx][3], gamma[g_k_idx + 1][3], - // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][3]}; - // which is g0 above - float32x4_t right_2 = vaddq_f32(g0, a13); - alpha[kp1_idx + 1] = vmaxq_f32(left_2, right_2); - } - } - - // Accumulate the state transition probabilities backwards through the state - // transition trellis starting from the beginning of the precomputed tail - // and calculate the conditional probabilities of each bit being either 0 - // or 1 - constexpr uint8x16_t idx_0312 = {0, 1, 2, 3, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11}; - constexpr uint8x16_t idx_3021 = {12, 13, 14, 15, 0, 1, 2, 3, - 8, 9, 10, 11, 4, 5, 6, 7}; - constexpr uint8x16_t idx_2130 = {8, 9, 10, 11, 4, 5, 6, 7, - 12, 13, 14, 15, 0, 1, 2, 3}; - constexpr uint8x16_t idx_1203 = {4, 5, 6, 7, 8, 9, 10, 11, - 0, 1, 2, 3, 12, 13, 14, 15}; - constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, - 8, 9, 10, 11, 0, 1, 2, 3}; - constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, - 4, 5, 6, 7, 12, 13, 14, 15}; - - float32x4x2_t beta_k; - float32x4x2_t beta_kp1 = {beta_tail[0], beta_tail[1]}; - - for (int32_t i = k4 - 1; i >= 0; i--) { - float32x4_t prob_0; - float32x4_t prob_1; - for (int32_t j = 3; j >= 0; j--) { - k_idx = 8 * i + j * 2; - - // We need g01_02 = {gamma[g_k_idx][0], gamma[g_k_idx][2], - // gamma[g_k_idx + 1][0], gamma[g_k_idx + 1][2]}; - // b01 = {beta[b_kp1_idx][0], beta[b_kp1_idx][0], - // beta[b_kp1_idx][1], beta[b_kp1_idx][1]}; - float32x4_t g01_02 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0312)); - float32x4_t b01 = vzip1q_f32(beta_kp1.val[0], beta_kp1.val[0]); - float32x4_t left_1 = 
vaddq_f32(g01_02, b01); - - // We need g13 = {gamma[g_k_idx][1], gamma[g_k_idx][3], - // gamma[g_k_idx + 1][1], gamma[g_k_idx + 1][3]}; - // bp1_01 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx + 1][0], - // beta[b_kp1_idx + 1][1], beta[b_kp1_idx + 1][1]}; - float32x4_t g13 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3021)); - float32x4_t bp1_01 = vzip1q_f32(beta_kp1.val[1], beta_kp1.val[1]); - float32x4_t right_1 = vaddq_f32(g13, bp1_01); - beta_k.val[0] = vmaxq_f32(left_1, right_1); - - // We need g23_02 = {gamma[g_k_idx + 2][0], gamma[g_k_idx + 2][2], - // gamma[g_k_idx + 3][0], gamma[g_k_idx + 3][2]}; - // We need b23 = {beta[b_kp1_idx][2], beta[b_kp1_idx][2], - // beta[b_kp1_idx][3], beta[b_kp1_idx][3]}; - float32x4_t g23_02 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_2130)); - float32x4_t b23 = vzip2q_f32(beta_kp1.val[0], beta_kp1.val[0]); - float32x4_t left_2 = vaddq_f32(g23_02, b23); - - // We need g23_13 = {gamma[g_k_idx + 2][1], gamma[g_k_idx + 2][3], - // gamma[g_k_idx + 3][1], gamma[g_k_idx + 3][3]}; - // bp1_23 = {beta[b_kp1_idx + 1][2], beta[b_kp1_idx + 1][2], - // beta[b_kp1_idx + 1][3], beta[b_kp1_idx + 1][3]}; - float32x4_t g23_13 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_1203)); - float32x4_t bp1_23 = vzip2q_f32(beta_kp1.val[1], beta_kp1.val[1]); - float32x4_t right_2 = vaddq_f32(g23_13, bp1_23); - beta_k.val[1] = vmaxq_f32(left_2, right_2); - - // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], - // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; - // a13 = {alpha[k_idx][1], alpha[k_idx][3], - // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; - // b02_13 = {beta[b_kp1_idx][0], beta[b_kp1_idx + 1][1], - // beta[b_kp1_idx][2], beta[b_kp1_idx + 1][3]}; - // b13_02 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx][1], - // beta[b_kp1_idx + 1][2], beta[b_kp1_idx][3]}; - float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); - float32x4_t b02_13 = - vtrn2q_f32(vrev64q_f32(beta_kp1.val[0]), beta_kp1.val[1]); - float32x4_t b13_02 = - vtrn2q_f32(vrev64q_f32(beta_kp1.val[1]), beta_kp1.val[0]); - - // Find the most probable path in which bit i was a 0 - // We need g01_01 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][1], - // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][1]}; - float32x4_t g01_01 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0220)); - left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); - right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); - prob_0[j] = vmaxvq_f32(vmaxq_f32(left_1, right_1)); - - // Find the most probable path in which bit i was a 1 - // We need g10_10 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][0], - // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][0]}; - float32x4_t g10_10 = vreinterpretq_f32_u8( - vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3113)); - left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); - right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); - prob_1[j] = vmaxvq_f32(vmaxq_f32(left_2, right_2)); - - // Store the current value of beta to use in the next - // round of calculations - beta_kp1 = beta_k; - } - - // Calculate the LLRs - llr[i] = vsubq_f32(prob_0, prob_1); - } -} - -} // namespace - -// The template parameter allows us to disable checking for convergence (and -// thus terminating the iterations early) so we always run a fixed number of -// iterations in our benchmarking -template -void armral::turbo::decode_block(const int8_t *sys, const 
int8_t *par, - const int8_t *itl, uint32_t k, uint8_t *dst, - float32_t l_c, uint32_t max_iter, - Allocator &allocator) { - // This implements multiple steps of the max-log-MAP algorithm, - // which is an approximation to the MAP (BCJR) algorithm. - // It returns a hard decision rather than raw LLRs - - // We will be working with float32x4_t, so work out how - // many of these will be needed to store k float32_ts. - // k is always a multiple of 8, so no need to worry about remainders. - uint32_t k4 = k >> 2; - - auto sys_f32 = allocate_uninitialized(allocator, k4 + 1); - auto par_f32 = allocate_uninitialized(allocator, k4 + 1); - auto itl_f32 = allocate_uninitialized(allocator, k4 + 1); - - auto perm_idx = allocate_uninitialized(allocator, k); - auto perm_sys = allocate_uninitialized(allocator, k4 + 1); - - struct perm_pair { - uint16_t first; - uint16_t second; - }; - - auto perm_lookup = allocate_uninitialized(allocator, k); - - // Allocate space to hold the extrinsic and permuted extrinsic information - // to be passed between the two decoders. Extrinsic is initially set to 0. - auto extrinsic = allocate_zeroed(allocator, k4); - auto perm_extrinsic = allocate_zeroed(allocator, k4); - - // Allocate space for log likelihood ratios from both stages of decoding - auto l1_uky = allocate_uninitialized(allocator, k4); - auto l2_uky = allocate_uninitialized(allocator, k4); - auto prev_l2_uky = allocate_zeroed(allocator, k4); - - // Allocate space to hold alpha and gamma - // alpha stores the forward-accumulated state probabilities for each decoded - // bit, where the LTE encoder has 8 states and there are k+3 bits to decode - // plus the starting condition - auto alpha = allocate_uninitialized(allocator, 8 * k4 + 2); - // gamma stores the conditional state transition probabilities for each of the - // k+3 bits to decode - auto gamma = allocate_uninitialized(allocator, k4); - - // NOTE: All allocations done. - if constexpr (Allocator::is_counting) { - return; - } - - // Convert our LLRs from int8_ts into float32_ts - convert_llrs(k, sys, sys_f32.get()); - convert_llrs(k, par, par_f32.get()); - convert_llrs(k, itl, itl_f32.get()); - - // Unperturb the trellis termination bits. They are transmitted as: - // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 - // Z'2 - // but need to appended to the inputs as: - // X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 - // Z'2 - // We append to the systematic (X), the parity (Z) and the interleaved parity - // (Z') values here, and to the interleaved systematic values (X') further - // down. - sys_f32[k4][0] = (float32_t)sys[k]; - sys_f32[k4][1] = (float32_t)itl[k]; - sys_f32[k4][2] = (float32_t)par[k + 1]; - - par_f32[k4][0] = (float32_t)par[k]; - par_f32[k4][1] = (float32_t)sys[k + 1]; - par_f32[k4][2] = (float32_t)itl[k + 1]; - - itl_f32[k4][0] = (float32_t)par[k + 2]; - itl_f32[k4][1] = (float32_t)sys[k + 3]; - itl_f32[k4][2] = (float32_t)itl[k + 3]; - - // Prescale l_c to avoid doing it repeatedly in the PDF calculations later. - const float32x4_t channel_reliability = vdupq_n_f32(l_c / 2); - - // Generate the permutation vector for the input value of k - // Find the index into the array of parameter arrays corresponding - // to the current k. Subtract 40 because k=40 is the lowest value. 
- int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; - // and extract the correct values of f1 and f2 to build the - // interleaving polynomial - uint16_t f1 = armral::turbo::perm_params[param_idx][0]; - uint16_t f2 = armral::turbo::perm_params[param_idx][1]; - for (uint32_t i = 0; i < k; i++) { - perm_idx[i] = generate_perm_idx(i, f1, f2, k); - } - - // Create a permuted version of the systematic output for use - // with the second decoder - for (uint32_t i = 0; i < k4; i++) { - perm_sys[i][0] = (float32_t)sys[perm_idx[(i * 4) + 0]]; - perm_sys[i][1] = (float32_t)sys[perm_idx[(i * 4) + 1]]; - perm_sys[i][2] = (float32_t)sys[perm_idx[(i * 4) + 2]]; - perm_sys[i][3] = (float32_t)sys[perm_idx[(i * 4) + 3]]; - } - perm_sys[k4][0] = (float32_t)sys[k + 2]; - perm_sys[k4][1] = (float32_t)itl[k + 2]; - perm_sys[k4][2] = (float32_t)par[k + 3]; - - // Create a look-up of the permutation vector that maps [0,...k-1] indices - // to vector element/vector lane pairs. This avoids having to a modulo - // operator every time we want to apply the permutation to vector elements - for (uint32_t i = 0; i < k; i++) { - uint16_t vec_idx = perm_idx[i] / 4; - uint16_t vec_lane = perm_idx[i] % 4; - perm_lookup[i] = perm_pair{vec_idx, vec_lane}; - } - - // Separate arrays to hold the betas of the trellis termination bits for the - // original and permuted inputs - float32x4_t beta_tail[2]; - float32x4_t perm_beta_tail[2]; - - // Initialise alpha - alpha[0] = vdupq_n_f32(-INFINITY); - alpha[1] = vdupq_n_f32(-INFINITY); - alpha[0][0] = 0; - - // Calculate the trellis termination state transition probabilities, which - // do not require any extrinsic information - trellis_termination(sys_f32.get(), par_f32.get(), k4, channel_reliability, - beta_tail); - trellis_termination(perm_sys.get(), itl_f32.get(), k4, channel_reliability, - perm_beta_tail); - - // Initialise the number of iterations - uint32_t num_iter = 0; - - while (num_iter < max_iter) { - // Run the first decoder step - decode_step(sys_f32.get(), par_f32.get(), extrinsic.get(), k4, l1_uky.get(), - alpha.get(), beta_tail, gamma.get(), channel_reliability); - - // Compute the new extrinsic information to pass into the second decoder - update_extrinsic(k4, l1_uky.get(), extrinsic.get(), sys_f32.get()); - - // Need to unpermute extrinsic to match input to second decoder - for (uint32_t i = 0; i < k4; i++) { - perm_extrinsic[i][0] = - extrinsic[perm_lookup[i * 4].first][perm_lookup[i * 4].second]; - perm_extrinsic[i][1] = extrinsic[perm_lookup[i * 4 + 1].first] - [perm_lookup[i * 4 + 1].second]; - perm_extrinsic[i][2] = extrinsic[perm_lookup[i * 4 + 2].first] - [perm_lookup[i * 4 + 2].second]; - perm_extrinsic[i][3] = extrinsic[perm_lookup[i * 4 + 3].first] - [perm_lookup[i * 4 + 3].second]; - } - - // Run the second decoder step - decode_step(perm_sys.get(), itl_f32.get(), perm_extrinsic.get(), k4, - l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), - channel_reliability); - - // Compute the new extrinsic information to pass back into the first encoder - update_extrinsic(k4, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); - - // But need to unpermute extrinsic first - for (uint32_t i = 0; i < k4; i++) { - extrinsic[perm_lookup[i * 4].first][perm_lookup[i * 4].second] = - perm_extrinsic[i][0]; - extrinsic[perm_lookup[i * 4 + 1].first][perm_lookup[i * 4 + 1].second] = - perm_extrinsic[i][1]; - extrinsic[perm_lookup[i * 4 + 2].first][perm_lookup[i * 4 + 2].second] = - perm_extrinsic[i][2]; - extrinsic[perm_lookup[i * 4 + 
3].first][perm_lookup[i * 4 + 3].second] = - perm_extrinsic[i][3]; - } - - // Compare this iteration's results with those from the previous iteration - float32_t max_abs_diff = 0.0; - float32_t max_abs_val = 0.0; - for (uint32_t i = 0; i < k4; i++) { - float32_t abs_diff = vmaxvq_f32(vabdq_f32(l2_uky[i], prev_l2_uky[i])); - float32_t abs_val = vmaxvq_f32(vabsq_f32(l2_uky[i])); - if (abs_diff > max_abs_diff) { - max_abs_diff = abs_diff; - } - if (abs_val > max_abs_val) { - max_abs_val = abs_val; - } - } - - // If we've converged, finish decoding - if constexpr (check_convergence) { - if (max_abs_diff / max_abs_val < - std::numeric_limits::epsilon()) { - break; - } - } - - // Store the current "final" LLRs to use in convergence checking next - // iteration - for (uint32_t i = 0; i < k4; i++) { - prev_l2_uky[i] = l2_uky[i]; - } - - num_iter++; - } - - // Return unpermuted final output from second encoder - // Rather than allocate another new vector, copy into l1_uky and return that - for (uint32_t i = 0; i < k4; i++) { - l1_uky[perm_lookup[i * 4].first][perm_lookup[i * 4].second] = l2_uky[i][0]; - l1_uky[perm_lookup[i * 4 + 1].first][perm_lookup[i * 4 + 1].second] = - l2_uky[i][1]; - l1_uky[perm_lookup[i * 4 + 2].first][perm_lookup[i * 4 + 2].second] = - l2_uky[i][2]; - l1_uky[perm_lookup[i * 4 + 3].first][perm_lookup[i * 4 + 3].second] = - l2_uky[i][3]; - } - - // Make a hard decision based on the final LLRs - turbo_llrs_to_bits(k, l1_uky.get(), dst); -} +#ifdef ARMRAL_ARCH_SVE +#include "turbo_decoder_fp16.hpp" +#else +#include "turbo_decoder_fp32.hpp" +#endif template void armral::turbo::decode_block( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, @@ -571,7 +36,7 @@ static armral_status turbo_decode_block(const int8_t *sys, const int8_t *par, // itself relative to the channel SNR). For reference see: // N. Wehn, "Turbo-decoding without SNR estimation", IEEE Communications // Letters 4(6), pp. 193-195, July 2000. 
- armral::turbo::decode_block(sys, par, itl, k, dst, 2.0, max_iter, + armral::turbo::decode_block(sys, par, itl, k, dst, 2.F, max_iter, allocator); return ARMRAL_SUCCESS; } diff --git a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp index 62acc61..896519e 100644 --- a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp index 7048c97..cd41869 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_tables.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp index 416cd38..6f00ac7 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/turbo_code.hpp b/src/UpperPHY/Turbo/turbo_code.hpp index f389fed..2dd3ff2 100644 --- a/src/UpperPHY/Turbo/turbo_code.hpp +++ b/src/UpperPHY/Turbo/turbo_code.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp b/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp new file mode 100644 index 0000000..b924436 --- /dev/null +++ b/src/UpperPHY/Turbo/turbo_decoder_fp16.hpp @@ -0,0 +1,520 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include + +namespace { + +struct float16x4x8_t { + float16x4_t val[8]; +}; + +// With Turbo codes n (=k) is always divisible by 8 so we +// do not have to worry about tail bits +inline void turbo_llrs_to_bits(uint32_t n, const float16x8_t *llr, + uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + constexpr uint16x8_t ones = {128, 64, 32, 16, 8, 4, 2, 1}; + + for (uint32_t i = 0; i < full_bytes; ++i) { + // The first bit to write in the byte is the most significant + uint16x8_t pred = vcltzq_f16(llr[i]); + uint16x8_t mask = vandq_u16(pred, ones); + data_out[i] = (uint8_t)vaddvq_u16(mask); + } +} + +// Take the input int8_t LLRs and convert them to float16x8_ts +inline void convert_llrs(uint32_t k, const int8_t *llrs, + float16x8_t *llrs_f16) { + constexpr int8x16_t idx = {127, 0, 127, 1, 127, 2, 127, 3, + 127, 4, 127, 5, 127, 6, 127, 7}; + // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time + for (uint32_t i = 0, j = 0; i < k; i += 8, j++) { + int8x8_t data = vld1_s8(&llrs[i]); + int16x8_t data_i16 = vreinterpretq_s16_s8(vtbl1q_s8(data, idx)); + llrs_f16[j] = vcvtq_n_f16_s16(data_i16, 8); + } +} + +// Calculate the PDF of the state transition probability on the assumption that +// we are operating on an 
AWGN channel:
+// PDF = x1/2 * (l_uk + l_c*y1) + l_c/2 * x2 * y2
+// In our implementation we assume the channel reliability, l_c,
+// has been prescaled by 1/2 to avoid doing so repeatedly here.
+template <int x1, int x2>
+inline float16x4_t transition_pdf(float16x8_t l_c, float16x8_t y1,
+                                  float16x8_t y2) {
+  return vget_low_f16(
+      vmulq_f16(l_c, vaddq_f16(vmulq_n_f16(y1, (float16_t)x1),
+                               vmulq_n_f16(y2, (float16_t)x2))));
+}
+
+template <int x1, int x2, bool use_extrinsic>
+inline float16x8_t transition_pdf(float16x8_t l_uk, float16x8_t l_c,
+                                  float16x8_t y1, float16x8_t y2) {
+  if constexpr (use_extrinsic) {
+    float16x8_t term1 =
+        vmulq_n_f16(vfmaq_f16(vmulq_n_f16(l_uk, 0.5F), l_c, y1), x1);
+    float16x8_t term2 = vmulq_f16(vmulq_n_f16(l_c, (float16_t)x2), y2);
+    return vaddq_f16(term1, term2);
+  } else {
+    return vmulq_f16(l_c, vaddq_f16(vmulq_n_f16(y1, (float16_t)x1),
+                                    vmulq_n_f16(y2, (float16_t)x2)));
+  }
+}
+
+// Update the extrinsic information output from the decoding stage
+// based on the computed LLRs, the old extrinsic information and the input.
+inline void update_extrinsic(uint32_t len, const float16x8_t *llr,
+                             float16x8_t *extrinsic, const float16x8_t *input) {
+  for (uint32_t i = 0; i < len; i++) {
+    extrinsic[i] = vsubq_f16(vsubq_f16(llr[i], extrinsic[i]), input[i]);
+  }
+}
+
+// Calculate the trellis termination values. These are independent of the
+// extrinsic information and so can be done once without needing to be updated
+// on every iteration.
+float16x8_t trellis_termination(const float16x8_t *sys, const float16x8_t *par,
+                                uint32_t k8, float16x8_t l_c) {
+  // We handle the gammas for the trellis termination bits separately
+  // as the state transitions are different. The x_{kl} are never 1
+  // here, because we always use inputs of 0 to drive the trellis back
+  // to state 0 in the encoder, so we only need to consider a smaller
+  // number of state transitions. We also do not have any extrinsic
+  // information. Because some of the gamma terms will
+  // always be -INFINITY (specifically indices [1] and [3]) we can forgo
+  // adding them to beta or taking the max with them, compared with
+  // when we calculate beta in the main calculations.
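+  // In scalar terms, the two surviving branch metrics computed below are
+  //   pdf_00 = l_c * (sys[k8] + par[k8])   // input bit 0, parity bit 0
+  //   pdf_01 = l_c * (sys[k8] - par[k8])   // input bit 0, parity bit 1
+  // (with l_c already prescaled by 1/2); all other transitions are impossible
+  // during termination, so their gammas are effectively -INFINITY.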
+ float16x4_t pdf_00 = transition_pdf<1, 1>(l_c, sys[k8], par[k8]); + float16x4_t pdf_01 = transition_pdf<1, -1>(l_c, sys[k8], par[k8]); + + float16x8_t g0102 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1], + pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]}; + + float16x8_t b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2], + pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]}; + + float16x8_t beta_term = vaddq_f16(g0102, b01); + + float16x8_t g = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0], + pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]}; + + float64x2_t beta_term_f64 = vreinterpretq_f64_f16(beta_term); + beta_term_f64 = vsetq_lane_f64(beta_term_f64[0], beta_term_f64, 1); + float16x8_t b0123 = vzip1q_f16(vreinterpretq_f16_f64(beta_term_f64), + vreinterpretq_f16_f64(beta_term_f64)); + + return vaddq_f16(g, b0123); +} + +// A single max-log-MAP decoder that works on an array of systematic bits (sys), +// an array of parity bits (par), and an array of extrinsic values from a +// previous decoding stage (extrinsic) +void decode_step(const float16x8_t *sys, const float16x8_t *par, + const float16x8_t *extrinsic, uint32_t k8, float16x8_t *llr, + float16x8_t *alpha, float16x8_t beta_tail, float16x4x8_t *pdf4, + float16x8_t l_c) { + uint32_t k_idx; + uint32_t kp1_idx; + + // Start by computing the non-zero conditional state transition probabilities + // from state s' to state s for every k, denoted gamma_k(s',s). In general for + // an AWGN channel (ignoring extrinsic information in l_uk): + // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) + // Here there are only 2 possible state transitions into each state + // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 + // terms. + for (uint32_t i = 0; i < k8; i++) { + // The x_{kl} values are the actual systematic and parity values that + // would result from the encoder having transited from state s' to s. + // They can only ever be either 0 or 1 so we precompute the four possible + // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note + // that these 0s and 1s have to be converted to 1s and -1s to match the + // values in y + // + // The y_{kl} values are the observed systematic and parity inputs. + // These have potentially been perturbed by noise on the channel + // + // Although each of the 8 states of the encoder has in theory 8 + // predecessor states, the encoder's structure means that not all state + // transitions are possible. Each state actually only has 2 predecessor + // states so we only have to compute 16 non-zero values for each input + // LLR. + float16x8_t pdf_00 = + transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_10 = + transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_01 = + transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + float16x8_t pdf_11 = + transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + + // There is considerable duplication in the values we could store. 
For + // example, for a single state the 16 gamma values are: + // + // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; + // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; + // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; + // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; + // + // We therefore choose to store the 4 unique pdf values (using st4) + // as this allows us to access the pdf values contiguously in the + // calculations needed for the alpha and beta values + vst4q_f16((float16_t *)&pdf4[i], + float16x8x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); + + // Accumulate the state transition probabilities forwards through the + // state transition trellis starting from the known encoder start state 0 + for (uint32_t j = 0; j < 8; j++) { + k_idx = 8 * i + j; + kp1_idx = k_idx + 1; + + float16x4_t fdp = vrev64_f16(pdf4[i].val[j]); + + // We need g02 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0], + // gamma[g_k_idx][2], gamma[g_k_idx + 1][2], + // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; + float16x8_t g02 = vcombine_f16(pdf4[i].val[j], fdp); + + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2], + // alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + float16x8_t a02 = vuzp1q_f16(alpha[k_idx], alpha[k_idx]); + float16x8_t left = vaddq_f16(g02, a02); + + // This is g02 with the 64-bit elements swapped + float16x8_t g20 = vcombine_f16(fdp, pdf4[i].val[j]); + + // We need a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3], + // alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + float16x8_t a13 = vuzp2q_f16(alpha[k_idx], alpha[k_idx]); + float16x8_t right = vaddq_f16(g20, a13); + + alpha[kp1_idx] = vmaxq_f16(left, right); + + // Normalize alpha + if (j % 4 == 0) { + float16x8_t alpha0 = vdupq_n_f16(alpha[kp1_idx][0]); + alpha[kp1_idx] = vsubq_f16(alpha[kp1_idx], alpha0); + } + } + } + + // Accumulate the state transition probabilities backwards through the state + // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1 + + constexpr uint8x16_t idx_even_odd = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; + + constexpr uint8x16_t idx_05274163 = {0, 1, 10, 11, 4, 5, 14, 15, + 8, 9, 2, 3, 12, 13, 6, 7}; + + constexpr uint8x16_t idx_0220 = {0, 1, 4, 5, 4, 5, 0, 1, + 0, 1, 4, 5, 4, 5, 0, 1}; + + constexpr uint8x16_t idx_3113 = {6, 7, 2, 3, 2, 3, 6, 7, + 6, 7, 2, 3, 2, 3, 6, 7}; + + constexpr uint8x16_t idx_0213 = {0, 1, 6, 7, 2, 3, 4, 5, + 4, 5, 2, 3, 6, 7, 0, 1}; + + constexpr uint8x16_t idx_1302 = {6, 7, 0, 1, 4, 5, 2, 3, + 2, 3, 4, 5, 0, 1, 6, 7}; + + float16x8_t beta_kp1 = beta_tail; + + for (int32_t i = k8 - 1; i >= 0; i--) { + float16x8_t prob_0; + float16x8_t prob_1; + + for (int32_t j = 7; j >= 0; j--) { + k_idx = 8 * i + j; + + // Normalize beta + if (j % 4 == 0) { + float16x8_t beta0 = vdupq_n_f16(beta_kp1[0]); + beta_kp1 = vsubq_f16(beta_kp1, beta0); + } + + uint8x16_t pdf8_u8 = + vreinterpretq_u8_f16(vcombine_f16(pdf4[i].val[j], pdf4[i].val[j])); + + // g0213 = {pdf[0], pdf[3], pdf[1], pdf[2], + // pdf[2], pdf[1], pdf[3], pdf[0]}; + float16x8_t g0213 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_0213)); + + // Reverse 32-bit elements in g0213 + // g1302 = {pdf[3], pdf[0], pdf[2], pdf[1], + // pdf[1], 
pdf[2], pdf[0], pdf[3]}; + float16x8_t g1302 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_1302)); + + // b0123 = {beta_kp1[0], beta_kp1[0], beta_kp1[1], beta_kp1[1], + // beta_kp1[2], beta_kp1[2], beta_kp1[3], beta_kp1[3]}; + // b4567 = {beta_kp1[4], beta_kp1[4], beta_kp1[5], beta_kp1[5], + // beta_kp1[6], beta_kp1[6], beta_kp1[7], beta_kp1[7]}; + float16x8_t b0123 = vzip1q_f16(beta_kp1, beta_kp1); + float16x8_t b4567 = vzip2q_f16(beta_kp1, beta_kp1); + + float16x8_t left = vaddq_f16(g0213, b0123); + float16x8_t right = vaddq_f16(g1302, b4567); + + float16x8_t beta_k = vmaxq_f16(left, right); + + // a0213 = {alpha[k_idx][0], alpha[k_idx][2], alpha[k_idx][4], alpha[k_idx][6], + // alpha[k_idx][1], alpha[k_idx][3], alpha[k_idx][5], alpha[k_idx][7]}; + float16x8_t a0213 = vreinterpretq_f16_u8( + vqtbl1q_u8(vreinterpretq_u8_f16(alpha[k_idx]), idx_even_odd)); + + // b0213_1302 = {beta_kp1[0], beta_kp1[5], beta_kp1[2], beta_kp1[7], + // beta_kp1[4], beta_kp1[1], beta_kp1[6], beta_kp1[3]}; + float16x8_t b0213_1302 = vreinterpretq_f16_u8( + vqtbl1q_u8(vreinterpretq_u8_f16(beta_kp1), idx_05274163)); + float16x8_t b1302_0213 = vextq_f16(b0213_1302, b0213_1302, 4); + + // g0101 = {pdf[0], pdf[2], pdf[2], pdf[0]}; + float16x8_t g0101 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_0220)); + + float16x8_t left_right_0 = vaddq_f16(vaddq_f16(a0213, b0213_1302), g0101); + float16x4_t left_0 = vget_low_f16(left_right_0); + float16x4_t right_0 = vget_high_f16(left_right_0); + + // g1010 = {pdf[3], pdf[1], pdf[1], pdf[3]}; + float16x8_t g1010 = vreinterpretq_f16_u8(vqtbl1q_u8(pdf8_u8, idx_3113)); + + float16x8_t left_right_1 = vaddq_f16(vaddq_f16(a0213, b1302_0213), g1010); + + float16x4_t left_1 = vget_low_f16(left_right_1); + float16x4_t right_1 = vget_high_f16(left_right_1); + + prob_0[j] = vmaxv_f16(vmax_f16(left_0, right_0)); + prob_1[j] = vmaxv_f16(vmax_f16(left_1, right_1)); + + // Store the current value of beta to use in the next + // round of calculations + beta_kp1 = beta_k; + } + + // Calculate the LLRs + llr[i] = vsubq_f16(prob_0, prob_1); + } +} + +} // namespace + +// The template parameter allows us to disable checking for convergence (and +// thus terminating the iterations early) so we always run a fixed number of +// iterations in our benchmarking +template +void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t k, uint8_t *dst, + float32_t l_c, uint32_t max_iter, + Allocator &allocator) { + // This implements multiple steps of the max-log-MAP algorithm, + // which is an approximation to the MAP (BCJR) algorithm. + // It returns a hard decision rather than raw LLRs + + // We will be working with float16x8_t, so work out how + // many of these will be needed to store k float16_ts. + // k is always a multiple of 8, so no need to worry about remainders. + uint32_t k8 = k >> 3; + + auto sys_f16 = allocate_uninitialized(allocator, k8 + 1); + auto par_f16 = allocate_uninitialized(allocator, k8 + 1); + auto itl_f16 = allocate_uninitialized(allocator, k8 + 1); + + auto perm_idx = allocate_uninitialized(allocator, k); + auto perm_sys = allocate_uninitialized(allocator, k8 + 1); + + struct perm_pair { + uint16_t first; + uint16_t second; + }; + + auto perm_lookup = allocate_uninitialized(allocator, k); + + // Allocate space to hold the extrinsic and permuted extrinsic information + // to be passed between the two decoders. Extrinsic is initially set to 0. 
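+  // Sketch of the iterative schedule implemented in the loop further down,
+  // using the variable names defined in this function:
+  //   l1_uky    = decode_step(sys_f16, par_f16, extrinsic)
+  //   extrinsic = l1_uky - extrinsic - sys_f16          (update_extrinsic)
+  //   permute extrinsic into perm_extrinsic
+  //   l2_uky    = decode_step(perm_sys, itl_f16, perm_extrinsic)
+  //   perm_extrinsic = l2_uky - perm_extrinsic - perm_sys, then un-permute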
+  auto extrinsic = allocate_zeroed(allocator, k8);
+  auto perm_extrinsic = allocate_zeroed(allocator, k8);
+
+  // Allocate space for log likelihood ratios from both stages of decoding
+  auto l1_uky = allocate_uninitialized(allocator, k8);
+  auto l2_uky = allocate_uninitialized(allocator, k8);
+  auto prev_l2_uky = allocate_zeroed(allocator, k8);
+
+  // Allocate space to hold alpha and gamma
+  // alpha stores the forward-accumulated state probabilities for each decoded
+  // bit, where the LTE encoder has 8 states and there are k+3 bits to decode
+  // plus the starting condition
+  auto alpha = allocate_uninitialized(allocator, 8 * k8 + 1);
+
+  // gamma stores the conditional state transition probabilities for each of the
+  // k+3 bits to decode
+  auto gamma = allocate_uninitialized(allocator, k8);
+
+  // NOTE: All allocations done.
+  if constexpr (Allocator::is_counting) {
+    return;
+  }
+
+  // Convert our LLRs from int8_ts into float16_ts
+  convert_llrs(k, sys, sys_f16.get());
+  convert_llrs(k, par, par_f16.get());
+  convert_llrs(k, itl, itl_f16.get());
+
+  // Unperturb the trellis termination bits. They are transmitted as:
+  //   X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 Z'2
+  // but need to be appended to the inputs as:
+  //   X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 Z'2
+  // We append to the systematic (X), the parity (Z) and the interleaved parity
+  // (Z') values here, and to the interleaved systematic values (X') further
+  // down.
+  sys_f16[k8][0] = (float16_t)sys[k];
+  sys_f16[k8][1] = (float16_t)itl[k];
+  sys_f16[k8][2] = (float16_t)par[k + 1];
+
+  par_f16[k8][0] = (float16_t)par[k];
+  par_f16[k8][1] = (float16_t)sys[k + 1];
+  par_f16[k8][2] = (float16_t)itl[k + 1];
+
+  itl_f16[k8][0] = (float16_t)par[k + 2];
+  itl_f16[k8][1] = (float16_t)sys[k + 3];
+  itl_f16[k8][2] = (float16_t)itl[k + 3];
+
+  // Prescale l_c to avoid doing it repeatedly in the PDF calculations later.
+  const float16x8_t channel_reliability = vdupq_n_f16((float16_t)l_c / 2);
+
+  // Generate the permutation vector for the input value of k
+  // Find the index into the array of parameter arrays corresponding
+  // to the current k. Subtract 40 because k=40 is the lowest value.
+  int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3];
+  // and extract the correct values of f1 and f2 to build the
+  // interleaving polynomial
+  uint16_t f1 = armral::turbo::perm_params[param_idx][0];
+  uint16_t f2 = armral::turbo::perm_params[param_idx][1];
+  for (uint32_t i = 0; i < k; i++) {
+    perm_idx[i] = generate_perm_idx(i, f1, f2, k);
+  }
+
+  // Create a permuted version of the systematic output for use
+  // with the second decoder
+  for (uint32_t i = 0; i < k8; i++) {
+    for (uint32_t j = 0; j < 8; j++) {
+      perm_sys[i][j] = (float16_t)sys[perm_idx[(i * 8) + j]];
+    }
+  }
+  perm_sys[k8][0] = (float16_t)sys[k + 2];
+  perm_sys[k8][1] = (float16_t)itl[k + 2];
+  perm_sys[k8][2] = (float16_t)par[k + 3];
+
+  // Create a look-up of the permutation vector that maps [0,...k-1] indices
+  // to vector element/vector lane pairs. This avoids having to use a modulo
+  // operator every time we want to apply the permutation to vector elements
+  for (uint32_t i = 0; i < k; i++) {
+    uint16_t vec_idx = perm_idx[i] / 8;
+    uint16_t vec_lane = perm_idx[i] % 8;
+    perm_lookup[i] = perm_pair{vec_idx, vec_lane};
+  }
+
+  // Initialize alpha
+  alpha[0] = vdupq_n_f16(-INFINITY);
+  alpha[0][0] = 0;
+
+  // Calculate the trellis termination state transition probabilities, which
+  // do not require any extrinsic information
+  float16x8_t beta_tail = trellis_termination(sys_f16.get(), par_f16.get(), k8,
+                                              channel_reliability);
+  float16x8_t perm_beta_tail = trellis_termination(
+      perm_sys.get(), itl_f16.get(), k8, channel_reliability);
+
+  // Initialize the number of iterations
+  uint32_t num_iter = 0;
+
+  while (num_iter < max_iter) {
+    // Run the first decoder step
+    decode_step(sys_f16.get(), par_f16.get(), extrinsic.get(), k8, l1_uky.get(),
+                alpha.get(), beta_tail, gamma.get(), channel_reliability);
+
+    // Compute the new extrinsic information to pass into the second decoder
+    update_extrinsic(k8, l1_uky.get(), extrinsic.get(), sys_f16.get());
+
+    // Need to unpermute extrinsic to match input to second decoder
+    for (uint32_t i = 0; i < k8; i++) {
+      for (uint32_t j = 0; j < 8; j++) {
+        perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 8 + j].first]
+                                        [perm_lookup[i * 8 + j].second];
+      }
+    }
+
+    // Run the second decoder step
+    decode_step(perm_sys.get(), itl_f16.get(), perm_extrinsic.get(), k8,
+                l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(),
+                channel_reliability);
+
+    // Compute the new extrinsic information to pass back into the first decoder
+    update_extrinsic(k8, l2_uky.get(), perm_extrinsic.get(), perm_sys.get());
+
+    // But need to unpermute extrinsic first
+    for (uint32_t i = 0; i < k8; i++) {
+      for (uint32_t j = 0; j < 8; j++) {
+        extrinsic[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] =
+            perm_extrinsic[i][j];
+      }
+    }
+
+    // Compare this iteration's results with those from the previous iteration
+    float16_t max_abs_diff = 0.0;
+    float16_t max_abs_val = 0.0;
+    for (uint32_t i = 0; i < k8; i++) {
+      float16_t abs_diff = vmaxvq_f16(vabdq_f16(l2_uky[i], prev_l2_uky[i]));
+      float16_t abs_val = vmaxvq_f16(vabsq_f16(l2_uky[i]));
+      if (abs_diff > max_abs_diff) {
+        max_abs_diff = abs_diff;
+      }
+      if (abs_val > max_abs_val) {
+        max_abs_val = abs_val;
+      }
+    }
+
+    // If we've converged, finish decoding
+    if constexpr (check_convergence) {
+      if (max_abs_diff / max_abs_val <
+          std::numeric_limits::epsilon()) {
+        break;
+      }
+    }
+
+    // Store the current "final" LLRs to use in convergence checking next
+    // iteration
+    for (uint32_t i = 0; i < k8; i++) {
+      prev_l2_uky[i] = l2_uky[i];
+    }
+
+    num_iter++;
+  }
+
+  // Return unpermuted final output from the second decoder
+  // Rather than allocate another new vector, copy into l1_uky and return that
+  for (uint32_t i = 0; i < k8; i++) {
+    for (uint32_t j = 0; j < 8; j++) {
+      l1_uky[perm_lookup[i * 8 + j].first][perm_lookup[i * 8 + j].second] =
+          l2_uky[i][j];
+    }
+  }
+
+  // Make a hard decision based on the final LLRs
+  turbo_llrs_to_bits(k, l1_uky.get(), dst);
+}
diff --git a/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp b/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp
new file mode 100644
index 0000000..1fdb679
--- /dev/null
+++ b/src/UpperPHY/Turbo/turbo_decoder_fp32.hpp
@@ -0,0 +1,533 @@
+/*
+  Arm RAN Acceleration Library
+  SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+*/
+#pragma once
+
+#include
+
+namespace {
+
+// With Turbo codes n (=k) is always divisible by 8 so we
+// do not have to worry about tail bits
+inline void turbo_llrs_to_bits(uint32_t n, const float32x4_t *llr,
+                               uint8_t *data_out) {
+  uint32_t full_bytes = n >> 3;
+  constexpr uint32x4_t ones_0 = {128, 64, 32, 16};
+  constexpr uint32x4_t ones_1 = {8, 4, 2, 1};
+
+  for (uint32_t i = 0; i < full_bytes; ++i) {
+    // The first bit to write in the byte is the most significant
+    uint32x4_t pred_0 = vcltzq_f32(llr[i * 2]);
+    uint32x4_t pred_1 = vcltzq_f32(llr[i * 2 + 1]);
+    uint32x4_t mask_0 = vandq_u32(pred_0, ones_0);
+    uint32x4_t mask_1 = vandq_u32(pred_1, ones_1);
+    uint32x4_t mask_2 = vorrq_u32(mask_0, mask_1);
+    data_out[i] = (uint8_t)vaddvq_u32(mask_2);
+  }
+}
+
+// Take the input int8_t LLRs and convert them to float32x4_ts
+inline void convert_llrs(uint32_t k, const int8_t *llrs,
+                         float32x4_t *llrs_f32) {
+  constexpr int8x16_t idx_0 = {127, 127, 127, 0, 127, 127, 127, 1,
+                               127, 127, 127, 2, 127, 127, 127, 3};
+  constexpr int8x16_t idx_1 = {127, 127, 127, 4, 127, 127, 127, 5,
+                               127, 127, 127, 6, 127, 127, 127, 7};
+  // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time
+  for (uint32_t i = 0, j = 0; i < k; i += 8, j += 2) {
+    int8x8_t data = vld1_s8(&llrs[i]);
+    int32x4_t ldata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_0));
+    int32x4_t hdata = vreinterpretq_s32_s8(vtbl1q_s8(data, idx_1));
+    llrs_f32[j] = vcvtq_n_f32_s32(ldata, 24);
+    llrs_f32[j + 1] = vcvtq_n_f32_s32(hdata, 24);
+  }
+}
+
+// Calculate the PDF of the state transition probability on the assumption that
+// we are operating on an AWGN channel:
+// PDF = x1/2 * (l_uk + l_c*y1) + l_c/2 * x2 * y2
+// In our implementation we assume the channel reliability, l_c,
+// has been prescaled by 1/2 to avoid doing so repeatedly here.
+template <int x1, int x2, bool use_extrinsic>
+inline float32x4_t transition_pdf(float32x4_t l_uk, float32x4_t l_c,
+                                  float32x4_t y1, float32x4_t y2) {
+  if constexpr (use_extrinsic) {
+    float32x4_t term1 =
+        vmulq_n_f32(vfmaq_f32(vmulq_n_f32(l_uk, 0.5F), l_c, y1), x1);
+    float32x4_t term2 = vmulq_f32(vmulq_n_f32(l_c, (float32_t)x2), y2);
+    return vaddq_f32(term1, term2);
+  } else {
+    return vmulq_f32(l_c, vaddq_f32(vmulq_n_f32(y1, (float32_t)x1),
+                                    vmulq_n_f32(y2, (float32_t)x2)));
+  }
+}
+
+// Update the extrinsic information output from the decoding stage
+// based on the computed LLRs, the old extrinsic information and the input.
+inline void update_extrinsic(uint32_t len, const float32x4_t *llr,
+                             float32x4_t *extrinsic, const float32x4_t *input) {
+  for (uint32_t i = 0; i < len; i++) {
+    extrinsic[i] = vsubq_f32(vsubq_f32(llr[i], extrinsic[i]), input[i]);
+  }
+}
+
+// Calculate the trellis termination values. These are independent of the
+// extrinsic information and so can be done once without needing to be updated
+// on every iteration.
+void trellis_termination(const float32x4_t *sys, const float32x4_t *par,
+                         uint32_t k4, float32x4_t l_c, float32x4_t *beta_out) {
+  // We handle the gammas for the trellis termination bits separately
+  // as the state transitions are different. The x_{kl} are never 1
+  // here, because we always use inputs of 0 to drive the trellis back
+  // to state 0 in the encoder, so we only need to consider a smaller
+  // number of state transitions. We also do not have any extrinsic
+  // information. Because some of the gamma terms will
+  // always be -INFINITY (specifically indices [1] and [3]) we can forgo
+  // adding them to beta or taking the max with them, compared with
+  // when we calculate beta in the main calculations.
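+  // For reference, this seeds the backward (beta) recursion of the
+  // max-log-MAP algorithm,
+  //   beta_k(s') = max_s [ beta_{k+1}(s) + gamma_{k+1}(s', s) ],
+  // evaluated over the tail transitions, starting from the known all-zero
+  // termination state of the encoder.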
+ float32x4_t unused_extrinsic = {0}; + float32x4_t pdf_00 = + transition_pdf<1, 1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); + float32x4_t pdf_01 = + transition_pdf<1, -1, false>(unused_extrinsic, l_c, sys[k4], par[k4]); + + // We need b01 = {pdf_00[2], pdf_00[2], pdf_01[2], pdf_01[2]} + float32x4_t pdf_uzp1 = vuzp1q_f32(pdf_00, pdf_01); + float32x4_t b01 = vtrn2q_f32(pdf_uzp1, pdf_uzp1); + + // We need g01_02 = {pdf_00[1], pdf_01[1], pdf_00[1], pdf_01[1]}; + float32x4_t pdf_uzp2 = vuzp2q_f32(pdf_00, pdf_01); + float32x4_t g01_02 = vuzp1q_f32(pdf_uzp2, pdf_uzp2); + + float32x4_t beta_term = vaddq_f32(g01_02, b01); + + // We need g01_02_1 = {pdf_00[0], pdf_01[0], pdf_00[0], pdf_01[0]}; + float32x4_t g01_02_1 = vuzp1q_f32(pdf_uzp1, pdf_uzp1); + + // We need b01_1 = {beta_term[0], beta_term[0], beta_term[1], beta_term[1]}; + float32x4_t b01_1 = vzip1q_f32(beta_term, beta_term); + beta_out[0] = vaddq_f32(g01_02_1, b01_1); + + // We need g23_02_1 = {pdf_01[0], pdf_00[0], pdf_01[0], pdf_00[0]}; + float32x4_t g23_02_1 = vrev64q_f32(g01_02_1); + + // We need b23_1 = {beta_term[2], beta_term[2], beta_term[3], beta_term[3]}; + float32x4_t b23_1 = vzip2q_f32(beta_term, beta_term); + beta_out[1] = vaddq_f32(g23_02_1, b23_1); +} + +// A single max-log-MAP decoder that works on an array of systematic bits (sys), +// an array of parity bits (par), and an array of extrinsic values from a +// previous decoding stage (extrinsic) +void decode_step(const float32x4_t *sys, const float32x4_t *par, + const float32x4_t *extrinsic, uint32_t k4, float32x4_t *llr, + float32x4_t *alpha, const float32x4_t *beta_tail, + float32x4x4_t *pdf4, float32x4_t l_c) { + uint32_t k_idx; + uint32_t kp1_idx; + + constexpr uint8x16_t rev_idx = {12, 13, 14, 15, 8, 9, 10, 11, + 4, 5, 6, 7, 0, 1, 2, 3}; + + // Start by computing the non-zero conditional state transition probabilities + // from state s' to state s for every k, denoted gamma_k(s',s). In general for + // an AWGN channel (ignoring extrinsic information in l_uk): + // gamma_k(s',s) = exp(L_c / 2 \sum_{l=1}^{n} x_{kl} y_{kl}) + // Here there are only 2 possible state transitions into each state + // (corresponding to encoding a 0 bit or a 1 bit) so the summation only has 2 + // terms. + for (uint32_t i = 0; i < k4; i++) { + // The x_{kl} values are the actual systematic and parity values that + // would result from the encoder having transited from state s' to s. + // They can only ever be either 0 or 1 so we precompute the four possible + // values in the exponential for x = (0,0), (0,1), (1,0) and (1,1). Note + // that these 0s and 1s have to be converted to 1s and -1s to match the + // values in y + // + // The y_{kl} values are the observed systematic and parity inputs. + // These have potentially been perturbed by noise on the channel + // + // Although each of the 8 states of the encoder has in theory 8 + // predecessor states, the encoder's structure means that not all state + // transitions are possible. Each state actually only has 2 predecessor + // states so we only have to compute 16 non-zero values for each input + // LLR. + float32x4_t pdf_00 = + transition_pdf<1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_10 = + transition_pdf<-1, 1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_01 = + transition_pdf<1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + float32x4_t pdf_11 = + transition_pdf<-1, -1, true>(extrinsic[i], l_c, sys[i], par[i]); + + // There is considerable duplication in the values we could store. 
For + // example, for a single state the 16 gamma values are: + // + // gamma[g_k_idx] = {pdf_00[j], pdf_11[j], pdf_11[j], pdf_00[j]}; + // gamma[g_k_idx+1] = {pdf_10[j], pdf_01[j], pdf_01[j], pdf_10[j]}; + // gamma[g_k_idx+2] = {pdf_01[j], pdf_10[j], pdf_10[j], pdf_01[j]}; + // gamma[g_k_idx+3] = {pdf_11[j], pdf_00[j], pdf_00[j], pdf_11[j]}; + // + // We therefore choose to store the 4 unique pdf values (using st4) + // as this allows us to access the pdf values contiguously in the + // calculations needed for the alpha and beta values + vst4q_f32((float32_t *)&pdf4[i], + float32x4x4_t({pdf_00, pdf_10, pdf_01, pdf_11})); + + // Accumulate the state transition probabilities forwards through the + // state transition trellis starting from the known encoder start state 0 + for (uint32_t j = 0; j < 4; j++) { + k_idx = 8 * i + j * 2; + kp1_idx = k_idx + 2; + + // We need g0 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0]}; + // a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + float32x4_t g0 = pdf4[i].val[j]; + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t left_1 = vaddq_f32(g0, a02); + // We need g2 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][2], + // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + float32x4_t g2 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), rev_idx)); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t right_1 = vaddq_f32(g2, a13); + alpha[kp1_idx] = vmaxq_f32(left_1, right_1); + + // We need g1 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][1], + // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][1]}; + // which is g2 above + float32x4_t left_2 = vaddq_f32(g2, a02); + // We need g3 = {gamma[g_k_idx][3], gamma[g_k_idx + 1][3], + // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][3]}; + // which is g0 above + float32x4_t right_2 = vaddq_f32(g0, a13); + alpha[kp1_idx + 1] = vmaxq_f32(left_2, right_2); + } + } + + // Accumulate the state transition probabilities backwards through the state + // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1 + constexpr uint8x16_t idx_0312 = {0, 1, 2, 3, 12, 13, 14, 15, + 4, 5, 6, 7, 8, 9, 10, 11}; + constexpr uint8x16_t idx_3021 = {12, 13, 14, 15, 0, 1, 2, 3, + 8, 9, 10, 11, 4, 5, 6, 7}; + constexpr uint8x16_t idx_2130 = {8, 9, 10, 11, 4, 5, 6, 7, + 12, 13, 14, 15, 0, 1, 2, 3}; + constexpr uint8x16_t idx_1203 = {4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 12, 13, 14, 15}; + constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, + 8, 9, 10, 11, 0, 1, 2, 3}; + constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, + 4, 5, 6, 7, 12, 13, 14, 15}; + + float32x4x2_t beta_k; + float32x4x2_t beta_kp1 = {beta_tail[0], beta_tail[1]}; + + for (int32_t i = k4 - 1; i >= 0; i--) { + float32x4_t prob_0; + float32x4_t prob_1; + for (int32_t j = 3; j >= 0; j--) { + k_idx = 8 * i + j * 2; + + // We need g01_02 = {gamma[g_k_idx][0], gamma[g_k_idx][2], + // gamma[g_k_idx + 1][0], gamma[g_k_idx + 1][2]}; + // b01 = {beta[b_kp1_idx][0], beta[b_kp1_idx][0], + // beta[b_kp1_idx][1], beta[b_kp1_idx][1]}; + float32x4_t g01_02 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0312)); + float32x4_t b01 = vzip1q_f32(beta_kp1.val[0], beta_kp1.val[0]); + float32x4_t left_1 = 
vaddq_f32(g01_02, b01); + + // We need g13 = {gamma[g_k_idx][1], gamma[g_k_idx][3], + // gamma[g_k_idx + 1][1], gamma[g_k_idx + 1][3]}; + // bp1_01 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx + 1][0], + // beta[b_kp1_idx + 1][1], beta[b_kp1_idx + 1][1]}; + float32x4_t g13 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3021)); + float32x4_t bp1_01 = vzip1q_f32(beta_kp1.val[1], beta_kp1.val[1]); + float32x4_t right_1 = vaddq_f32(g13, bp1_01); + beta_k.val[0] = vmaxq_f32(left_1, right_1); + + // We need g23_02 = {gamma[g_k_idx + 2][0], gamma[g_k_idx + 2][2], + // gamma[g_k_idx + 3][0], gamma[g_k_idx + 3][2]}; + // We need b23 = {beta[b_kp1_idx][2], beta[b_kp1_idx][2], + // beta[b_kp1_idx][3], beta[b_kp1_idx][3]}; + float32x4_t g23_02 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_2130)); + float32x4_t b23 = vzip2q_f32(beta_kp1.val[0], beta_kp1.val[0]); + float32x4_t left_2 = vaddq_f32(g23_02, b23); + + // We need g23_13 = {gamma[g_k_idx + 2][1], gamma[g_k_idx + 2][3], + // gamma[g_k_idx + 3][1], gamma[g_k_idx + 3][3]}; + // bp1_23 = {beta[b_kp1_idx + 1][2], beta[b_kp1_idx + 1][2], + // beta[b_kp1_idx + 1][3], beta[b_kp1_idx + 1][3]}; + float32x4_t g23_13 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_1203)); + float32x4_t bp1_23 = vzip2q_f32(beta_kp1.val[1], beta_kp1.val[1]); + float32x4_t right_2 = vaddq_f32(g23_13, bp1_23); + beta_k.val[1] = vmaxq_f32(left_2, right_2); + + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; + // b02_13 = {beta[b_kp1_idx][0], beta[b_kp1_idx + 1][1], + // beta[b_kp1_idx][2], beta[b_kp1_idx + 1][3]}; + // b13_02 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx][1], + // beta[b_kp1_idx + 1][2], beta[b_kp1_idx][3]}; + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t b02_13 = + vtrn2q_f32(vrev64q_f32(beta_kp1.val[0]), beta_kp1.val[1]); + float32x4_t b13_02 = + vtrn2q_f32(vrev64q_f32(beta_kp1.val[1]), beta_kp1.val[0]); + + // Find the most probable path in which bit i was a 0 + // We need g01_01 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][1], + // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][1]}; + float32x4_t g01_01 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0220)); + left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); + right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); + prob_0[j] = vmaxvq_f32(vmaxq_f32(left_1, right_1)); + + // Find the most probable path in which bit i was a 1 + // We need g10_10 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][0], + // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][0]}; + float32x4_t g10_10 = vreinterpretq_f32_u8( + vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3113)); + left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); + right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); + prob_1[j] = vmaxvq_f32(vmaxq_f32(left_2, right_2)); + + // Store the current value of beta to use in the next + // round of calculations + beta_kp1 = beta_k; + } + + // Calculate the LLRs + llr[i] = vsubq_f32(prob_0, prob_1); + } +} + +} // namespace + +// The template parameter allows us to disable checking for convergence (and +// thus terminating the iterations early) so we always run a fixed number of +// iterations in our benchmarking +template +void armral::turbo::decode_block(const int8_t *sys, const 
int8_t *par, + const int8_t *itl, uint32_t k, uint8_t *dst, + float32_t l_c, uint32_t max_iter, + Allocator &allocator) { + // This implements multiple steps of the max-log-MAP algorithm, + // which is an approximation to the MAP (BCJR) algorithm. + // It returns a hard decision rather than raw LLRs + + // We will be working with float32x4_t, so work out how + // many of these will be needed to store k float32_ts. + // k is always a multiple of 8, so no need to worry about remainders. + uint32_t k4 = k >> 2; + + auto sys_f32 = allocate_uninitialized(allocator, k4 + 1); + auto par_f32 = allocate_uninitialized(allocator, k4 + 1); + auto itl_f32 = allocate_uninitialized(allocator, k4 + 1); + + auto perm_idx = allocate_uninitialized(allocator, k); + auto perm_sys = allocate_uninitialized(allocator, k4 + 1); + + struct perm_pair { + uint16_t first; + uint16_t second; + }; + + auto perm_lookup = allocate_uninitialized(allocator, k); + + // Allocate space to hold the extrinsic and permuted extrinsic information + // to be passed between the two decoders. Extrinsic is initially set to 0. + auto extrinsic = allocate_zeroed(allocator, k4); + auto perm_extrinsic = allocate_zeroed(allocator, k4); + + // Allocate space for log likelihood ratios from both stages of decoding + auto l1_uky = allocate_uninitialized(allocator, k4); + auto l2_uky = allocate_uninitialized(allocator, k4); + auto prev_l2_uky = allocate_zeroed(allocator, k4); + + // Allocate space to hold alpha and gamma + // alpha stores the forward-accumulated state probabilities for each decoded + // bit, where the LTE encoder has 8 states and there are k+3 bits to decode + // plus the starting condition + auto alpha = allocate_uninitialized(allocator, 8 * k4 + 2); + // gamma stores the conditional state transition probabilities for each of the + // k+3 bits to decode + auto gamma = allocate_uninitialized(allocator, k4); + + // NOTE: All allocations done. + if constexpr (Allocator::is_counting) { + return; + } + + // Convert our LLRs from int8_ts into float32_ts + convert_llrs(k, sys, sys_f32.get()); + convert_llrs(k, par, par_f32.get()); + convert_llrs(k, itl, itl_f32.get()); + + // Unperturb the trellis termination bits. They are transmitted as: + // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 + // Z'2 + // but need to appended to the inputs as: + // X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 + // Z'2 + // We append to the systematic (X), the parity (Z) and the interleaved parity + // (Z') values here, and to the interleaved systematic values (X') further + // down. + sys_f32[k4][0] = (float32_t)sys[k]; + sys_f32[k4][1] = (float32_t)itl[k]; + sys_f32[k4][2] = (float32_t)par[k + 1]; + + par_f32[k4][0] = (float32_t)par[k]; + par_f32[k4][1] = (float32_t)sys[k + 1]; + par_f32[k4][2] = (float32_t)itl[k + 1]; + + itl_f32[k4][0] = (float32_t)par[k + 2]; + itl_f32[k4][1] = (float32_t)sys[k + 3]; + itl_f32[k4][2] = (float32_t)itl[k + 3]; + + // Prescale l_c to avoid doing it repeatedly in the PDF calculations later. + const float32x4_t channel_reliability = vdupq_n_f32(l_c / 2); + + // Generate the permutation vector for the input value of k + // Find the index into the array of parameter arrays corresponding + // to the current k. Subtract 40 because k=40 is the lowest value. 
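+  // The permutation generated below is the LTE quadratic permutation
+  // polynomial (QPP) interleaver from 3GPP TS 36.212,
+  //   perm_idx[i] = (f1*i + f2*i*i) mod k,
+  // with (f1, f2) looked up per block size k. generate_perm_idx is assumed to
+  // evaluate exactly this polynomial.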
+ int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; + // and extract the correct values of f1 and f2 to build the + // interleaving polynomial + uint16_t f1 = armral::turbo::perm_params[param_idx][0]; + uint16_t f2 = armral::turbo::perm_params[param_idx][1]; + for (uint32_t i = 0; i < k; i++) { + perm_idx[i] = generate_perm_idx(i, f1, f2, k); + } + + // Create a permuted version of the systematic output for use + // with the second decoder + for (uint32_t i = 0; i < k4; i++) { + for (uint32_t j = 0; j < 4; j++) { + perm_sys[i][j] = (float32_t)sys[perm_idx[(i * 4) + j]]; + } + } + perm_sys[k4][0] = (float32_t)sys[k + 2]; + perm_sys[k4][1] = (float32_t)itl[k + 2]; + perm_sys[k4][2] = (float32_t)par[k + 3]; + + // Create a look-up of the permutation vector that maps [0,...k-1] indices + // to vector element/vector lane pairs. This avoids having to a modulo + // operator every time we want to apply the permutation to vector elements + for (uint32_t i = 0; i < k; i++) { + uint16_t vec_idx = perm_idx[i] / 4; + uint16_t vec_lane = perm_idx[i] % 4; + perm_lookup[i] = perm_pair{vec_idx, vec_lane}; + } + + // Separate arrays to hold the betas of the trellis termination bits for the + // original and permuted inputs + float32x4_t beta_tail[2]; + float32x4_t perm_beta_tail[2]; + + // Initialize alpha + alpha[0] = vdupq_n_f32(-INFINITY); + alpha[1] = vdupq_n_f32(-INFINITY); + alpha[0][0] = 0; + + // Calculate the trellis termination state transition probabilities, which + // do not require any extrinsic information + trellis_termination(sys_f32.get(), par_f32.get(), k4, channel_reliability, + beta_tail); + trellis_termination(perm_sys.get(), itl_f32.get(), k4, channel_reliability, + perm_beta_tail); + + // Initialize the number of iterations + uint32_t num_iter = 0; + + while (num_iter < max_iter) { + // Run the first decoder step + decode_step(sys_f32.get(), par_f32.get(), extrinsic.get(), k4, l1_uky.get(), + alpha.get(), beta_tail, gamma.get(), channel_reliability); + + // Compute the new extrinsic information to pass into the second decoder + update_extrinsic(k4, l1_uky.get(), extrinsic.get(), sys_f32.get()); + + // Need to unpermute extrinsic to match input to second decoder + for (uint32_t i = 0; i < k4; i++) { + for (uint32_t j = 0; j < 4; j++) { + perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 4 + j].first] + [perm_lookup[i * 4 + j].second]; + } + } + + // Run the second decoder step + decode_step(perm_sys.get(), itl_f32.get(), perm_extrinsic.get(), k4, + l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), + channel_reliability); + + // Compute the new extrinsic information to pass back into the first encoder + update_extrinsic(k4, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); + + // But need to unpermute extrinsic first + for (uint32_t i = 0; i < k4; i++) { + for (uint32_t j = 0; j < 4; j++) { + extrinsic[perm_lookup[i * 4 + j].first][perm_lookup[i * 4 + j].second] = + perm_extrinsic[i][j]; + } + } + + // Compare this iteration's results with those from the previous iteration + float32_t max_abs_diff = 0.0; + float32_t max_abs_val = 0.0; + for (uint32_t i = 0; i < k4; i++) { + float32_t abs_diff = vmaxvq_f32(vabdq_f32(l2_uky[i], prev_l2_uky[i])); + float32_t abs_val = vmaxvq_f32(vabsq_f32(l2_uky[i])); + if (abs_diff > max_abs_diff) { + max_abs_diff = abs_diff; + } + if (abs_val > max_abs_val) { + max_abs_val = abs_val; + } + } + + // If we've converged, finish decoding + if constexpr (check_convergence) { + if (max_abs_diff / max_abs_val < + 
std::numeric_limits::epsilon()) { + break; + } + } + + // Store the current "final" LLRs to use in convergence checking next + // iteration + for (uint32_t i = 0; i < k4; i++) { + prev_l2_uky[i] = l2_uky[i]; + } + + num_iter++; + } + + // Return unpermuted final output from second encoder + // Rather than allocate another new vector, copy into l1_uky and return that + for (uint32_t i = 0; i < k4; i++) { + for (uint32_t j = 0; j < 4; j++) { + l1_uky[perm_lookup[i * 4 + j].first][perm_lookup[i * 4 + j].second] = + l2_uky[i][j]; + } + } + + // Make a hard decision based on the final LLRs + turbo_llrs_to_bits(k, l1_uky.get(), dst); +} diff --git a/src/UpperPHY/Turbo/turbo_tables.hpp b/src/UpperPHY/Turbo/turbo_tables.hpp index 1a59ae8..f2de1e2 100644 --- a/src/UpperPHY/Turbo/turbo_tables.hpp +++ b/src/UpperPHY/Turbo/turbo_tables.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/intrinsics.h b/src/intrinsics.h index 7fd26f0..5e35954 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp index e664173..1876dd2 100644 --- a/src/utils/allocators.hpp +++ b/src/utils/allocators.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/utils/cmplx_arith_f32.hpp b/src/utils/cmplx_arith_f32.hpp index 32644da..87022ed 100644 --- a/src/utils/cmplx_arith_f32.hpp +++ b/src/utils/cmplx_arith_f32.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/utils/vec_mul.hpp b/src/utils/vec_mul.hpp index 2c4896a..a352058 100644 --- a/src/utils/vec_mul.hpp +++ b/src/utils/vec_mul.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/MatrixInv/batch/main.cpp b/test/BasicMathFun/MatrixInv/Batch/main.cpp similarity index 87% rename from test/MatrixInv/batch/main.cpp rename to test/BasicMathFun/MatrixInv/Batch/main.cpp index 74b1fcd..c8ff2e5 100644 --- a/test/MatrixInv/batch/main.cpp +++ b/test/BasicMathFun/MatrixInv/Batch/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -45,8 +45,9 @@ static void reference_parallel_matinv_block(uint32_t m, * generated input matrix */ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, - bool is_hpd, float scale_re = 1.0F, - float scale_im = 1.0F) { + bool is_hpd, + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_batch_hermitian_matrix_%uX%u_rand () [BATCH] " "[armral_cmplx_f32_t], %s input matrix (scale={%.2f,%.2f}) *****\n", m, m, is_hpd ? 
"HPD" : "Hermitian", scale_re, scale_im); @@ -82,9 +83,9 @@ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_HER_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -94,8 +95,8 @@ static bool run_batch_hermitian_matinv_test(uint32_t batch_size, uint32_t m, */ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, bool is_hpd, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_batch_pa_hermitian_matrix_%uX%u_rand () [BATCH PA] " "[armral_cmplx_f32_t], %s input matrix (scale={%.2f,%.2f}) *****\n", m, m, is_hpd ? "HPD" : "Hermitian", scale_re, scale_im); @@ -138,9 +139,9 @@ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_HER_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -149,8 +150,8 @@ static bool run_batch_pa_hermitian_matinv_test(uint32_t batch_size, uint32_t m, * generated input matrix */ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf( "\n***** test_batch_matrix_%uX%u_rand () [BATCH] " "[armral_cmplx_f32_t], general input matrix (scale={%.2f,%.2f}) *****\n", @@ -185,9 +186,9 @@ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_GEN_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_GEN_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } @@ -196,8 +197,8 @@ static bool run_batch_matinv_test(uint32_t batch_size, uint32_t m, * for randomly generated input matrix */ static bool run_batch_pa_matinv_test(uint32_t batch_size, uint32_t m, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf( "\n***** test_batch_pa_matrix_%uX%u_rand () [BATCH PA] " "[armral_cmplx_f32_t], general input matrix (scale={%.2f,%.2f}) *****\n", @@ -240,9 +241,9 @@ static bool run_batch_pa_matinv_test(uint32_t batch_size, uint32_t m, passed = false; // GCOVR_EXCL_LINE } } - passed &= check_results_mat_inv("RAND_PARA_GEN_MAT_INV", (float *)res.data(), - (float *)ref.data(), batch_size * 2 * m * m, - (float)m, (float)m); + passed &= check_results_mat_inv( + "RAND_PARA_GEN_MAT_INV", (float32_t *)res.data(), (float32_t *)ref.data(), + batch_size * 2 * m * m, (float32_t)m, (float32_t)m); return passed; } diff --git a/test/MatrixInv/single/main.cpp b/test/BasicMathFun/MatrixInv/Single/main.cpp similarity index 83% rename from test/MatrixInv/single/main.cpp rename to test/BasicMathFun/MatrixInv/Single/main.cpp index 9d7e3f9..480b5d7 100644 --- 
a/test/MatrixInv/single/main.cpp +++ b/test/BasicMathFun/MatrixInv/Single/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -17,8 +17,8 @@ * have linearly independent rows/cols and thus be invertible. */ static bool run_hermitian_matinv_test(uint32_t m, bool enable_id_check, - bool is_hpd, float scale_re = 1.0F, - float scale_im = 1.0F) { + bool is_hpd, float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_hermitian_matrix_%uX%u_rand () [SINGLE] " "[armral_cmplx_f32_t], " "%s input matrix (scale={%.2f,%.2f}) *****\n", @@ -44,15 +44,15 @@ static bool run_hermitian_matinv_test(uint32_t m, bool enable_id_check, passed = false; } - passed &= - check_results_mat_inv("RAND_HER_MAT_INV", (float *)res.data(), - (float *)ref.data(), 2 * m * m, (float)m, (float)m); + passed &= check_results_mat_inv("RAND_HER_MAT_INV", (float32_t *)res.data(), + (float32_t *)ref.data(), 2 * m * m, + (float32_t)m, (float32_t)m); return passed; } static bool run_general_matinv_test(uint32_t m, bool enable_id_check, - float scale_re = 1.0F, - float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { printf("\n***** test_general_mat_inverse_%uX%u_rand () [SINGLE] " "[armral_cmplx_f32_t], " "input matrix (scale={%.2f,%.2f}) *****\n", @@ -77,9 +77,9 @@ static bool run_general_matinv_test(uint32_t m, bool enable_id_check, passed = false; } - passed &= - check_results_mat_inv("RAND_MAT_INV", (float *)res.data(), - (float *)ref.data(), 2 * m * m, (float)m, (float)m); + passed &= check_results_mat_inv("RAND_MAT_INV", (float32_t *)res.data(), + (float32_t *)ref.data(), 2 * m * m, + (float32_t)m, (float32_t)m); return passed; } diff --git a/test/MatrixMult/batch/ArmSolve/main.cpp b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp similarity index 97% rename from test/MatrixMult/batch/ArmSolve/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp index 4ce7201..97d3776 100644 --- a/test/MatrixMult/batch/ArmSolve/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "cs16_utils.hpp" @@ -93,8 +93,8 @@ run_reference_solve(uint32_t num_sub_carrier, uint32_t sc_per_g, armral_cmplx_int16_t *p_x, uint32_t p_xstride) { int total_cols = num_sub_carrier; for (int j = 0; j < total_cols; ++j) { - float *g_re = &p_g_real[j / sc_per_g]; - float *g_im = &p_g_imag[j / sc_per_g]; + float32_t *g_re = &p_g_real[j / sc_per_g]; + float32_t *g_im = &p_g_imag[j / sc_per_g]; for (int i = 0; i < X; ++i) { std::complex acc = 0; for (int k = 0; k < Y; ++k) { diff --git a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp similarity index 98% rename from test/MatrixMult/batch/MatrixVectorMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp index c8e8510..d47fc50 100644 --- a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates */ #include "matrix_utils.hpp" diff --git a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp similarity index 97% rename from test/MatrixMult/batch/MatrixVectorMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp index 41f3c73..9c56ee4 100644 --- a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/MatrixMult/single/MatrixMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp index e882b7d..e0322ac 100644 --- a/test/MatrixMult/single/MatrixMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp index a77c7f8..3c70e00 100644 --- a/test/MatrixMult/single/MatrixMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" @@ -44,8 +44,8 @@ static bool run_specific_2x2_iq_matmul_test() { const auto a_im = unpack_imag_cf32(a); const auto b_re = unpack_real_cf32(b); const auto b_im = unpack_imag_cf32(b); - std::vector c_re(4); - std::vector c_im(4); + std::vector c_re(4); + std::vector c_im(4); armral_cmplx_mat_mult_2x2_f32_iq(a_re.data(), a_im.data(), b_re.data(), b_im.data(), c_re.data(), c_im.data()); const auto c = pack_cf32(c_re, c_im); @@ -80,8 +80,8 @@ static bool run_specific_4x4_iq_matmul_test() { const auto a_im = unpack_imag_cf32(a); const auto b_re = unpack_real_cf32(b); const auto b_im = unpack_imag_cf32(b); - std::vector c_re(16); - std::vector c_im(16); + std::vector c_re(16); + std::vector c_im(16); armral_cmplx_mat_mult_4x4_f32_iq(a_re.data(), a_im.data(), b_re.data(), b_im.data(), c_re.data(), c_im.data()); const auto c = pack_cf32(c_re, c_im); diff --git a/test/MatrixMult/single/MatrixMultAAH32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp similarity index 88% rename from test/MatrixMult/single/MatrixMultAAH32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 83c4e01..854bb26 100644 --- a/test/MatrixMult/single/MatrixMultAAH32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" diff --git 
a/test/MatrixMult/single/MatrixMultAHB32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp similarity index 93% rename from test/MatrixMult/single/MatrixMultAHB32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index c09b5b7..883f8bb 100644 --- a/test/MatrixMult/single/MatrixMultAHB32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include diff --git a/test/MatrixMult/single/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp similarity index 94% rename from test/MatrixMult/single/MatrixVectorMult16/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp index f6d987d..c859193 100644 --- a/test/MatrixMult/single/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp similarity index 89% rename from test/MatrixMult/single/MatrixVectorMult32/main.cpp rename to test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp index 4283a28..d186048 100644 --- a/test/MatrixMult/single/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" diff --git a/test/MatrixPseudoInv/direct/main.cpp b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp similarity index 82% rename from test/MatrixPseudoInv/direct/main.cpp rename to test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp index 74f99d8..e644718 100644 --- a/test/MatrixPseudoInv/direct/main.cpp +++ b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" @@ -27,7 +27,12 @@ reference_left_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, // Compute B = C^(-1) std::vector mat_inv(n * n); - reference_matinv_block(n, mat_aha, mat_inv.data()); + if (n == 1) { + mat_inv[0].re = 1.F / mat_aha[0].re; + mat_inv[0].im = 0.F; + } else { + reference_matinv_block(n, mat_aha, mat_inv.data()); + } // Compute B * A^H reference_matmul_bah_cf32(m, n, p_src, mat_inv.data(), p_dst); @@ -50,7 +55,12 @@ static inline void reference_right_pseudo_inverse_direct( // Compute B = C^(-1) std::vector mat_inv(m * m); - reference_matinv_block(m, mat_aah, mat_inv.data()); + if (m == 1) { + mat_inv[0].re = 1.F / mat_aah[0].re; + mat_inv[0].im = 0.F; + } else { + reference_matinv_block(m, mat_aah, mat_inv.data()); + } // Compute A^H * B reference_matmul_ahb_cf32(m, n, m, p_src, mat_inv.data(), p_dst); @@ -89,13 +99,14 @@ bool run_all_tests(char const *test_name, char const *function_name, bool passed = true; const std::tuple params[] = { - 
{2, 5, -0.968591}, {2, 84, 0.191647}, {2, 2, 1.457848}, - {2, 67, 0.0}, {3, 18, -1.218053}, {3, 138, 1.597186}, - {3, 3, -1.2435186}, {3, 161, 0.0}, {4, 20, -0.474817}, - {4, 105, 0.944802}, {4, 4, 1.645646}, {4, 94, 0.0}, - {8, 35, -1.991369}, {8, 200, -1.244298}, {8, 8, 1.445767}, - {8, 190, 0.0}, {16, 32, 0.809352}, {16, 80, 1.810591}, - {16, 16, -0.426745}, {16, 117, 0.0}}; + {1, 1, 0.186745}, {1, 21, -0.314205}, {1, 66, 1.495806}, + {1, 121, 0.0}, {2, 5, -0.968591}, {2, 84, 0.191647}, + {2, 2, 1.457848}, {2, 67, 0.0}, {3, 18, -1.218053}, + {3, 138, 1.597186}, {3, 3, -1.2435186}, {3, 161, 0.0}, + {4, 20, -0.474817}, {4, 105, 0.944802}, {4, 4, 1.645646}, + {4, 94, 0.0}, {8, 35, -1.991369}, {8, 200, -1.244298}, + {8, 8, 1.445767}, {8, 190, 0.0}, {16, 32, 0.809352}, + {16, 80, 1.810591}, {16, 16, -0.426745}, {16, 117, 0.0}}; for (const auto &[dim1, dim2, l] : params) { printf("[%s] m=%d, n=%d, l=%f\n", function_name, dim1, dim2, l); passed &= run_pseudo_inverse_direct_cf32_test(function_name, dim1, dim2, l, diff --git a/test/VectorDotProd/vecDot16/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp similarity index 91% rename from test/VectorDotProd/vecDot16/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16/main.cpp index 0812631..6003f61 100644 --- a/test/VectorDotProd/vecDot16/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp similarity index 92% rename from test/VectorDotProd/vecDot16_2/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index 764b191..8ac1092 100644 --- a/test/VectorDotProd/vecDot16_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_2_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp similarity index 93% rename from test/VectorDotProd/vecDot16_2_32bit/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index 8fa6efb..904b13f 100644 --- a/test/VectorDotProd/vecDot16_2_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp similarity index 92% rename from test/VectorDotProd/vecDot16_32bit/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index f400649..b4784ff 100644 --- a/test/VectorDotProd/vecDot16_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot32/main.cpp 
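The n == 1 and m == 1 branches added to the reference pseudo-inverse routines above skip reference_matinv_block for 1-by-1 systems: the matrix it would otherwise invert (mat_aha for the left case, mat_aah for the right case) is Hermitian, so its single diagonal entry is real and the inverse is just the real reciprocal. A minimal sketch of that special case, assuming only the armral_cmplx_f32_t struct layout used throughout these tests:

  // Sketch only: inverse of a 1x1 Hermitian block.
  static armral_cmplx_f32_t invert_1x1(armral_cmplx_f32_t c00) {
    // The diagonal of a Hermitian matrix is real, so c00.im is
    // (numerically) zero and inversion reduces to a real reciprocal.
    return {1.0F / c00.re, 0.0F};
  }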
b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp similarity index 82% rename from test/VectorDotProd/vecDot32/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot32/main.cpp index beca6f0..a72d8ea 100644 --- a/test/VectorDotProd/vecDot32/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -19,7 +19,7 @@ static bool run_vec_dot_test(uint32_t num_samples) { for (uint32_t i = 0; i < num_samples; ++i) { acc += cmplx_mul_widen_cf32(a[i], b[i]); } - armral_cmplx_f32_t ref{(float)acc.real(), (float)acc.imag()}; + armral_cmplx_f32_t ref{(float32_t)acc.real(), (float32_t)acc.imag()}; return check_results_cf32(NAME, c.data(), &ref, 1); } diff --git a/test/VectorDotProd/vecDot32_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp similarity index 87% rename from test/VectorDotProd/vecDot32_2/main.cpp rename to test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp index b40850c..232c14b 100644 --- a/test/VectorDotProd/vecDot32_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -28,7 +28,7 @@ static bool run_vec_dot_test(uint32_t num_samples) { for (uint32_t i = 0; i < num_samples; ++i) { acc += cmplx_mul_widen_cf32(a[i], b[i]); } - armral_cmplx_f32_t ref{(float)acc.real(), (float)acc.imag()}; + armral_cmplx_f32_t ref{(float32_t)acc.real(), (float32_t)acc.imag()}; return check_results_cf32(NAME, c.data(), &ref, 1); } diff --git a/test/ElemWiseVectorMult/vecMul16/main.cpp b/test/BasicMathFun/VectorMult/VecMul16/main.cpp similarity index 97% rename from test/ElemWiseVectorMult/vecMul16/main.cpp rename to test/BasicMathFun/VectorMult/VecMul16/main.cpp index ea5da35..aadebde 100644 --- a/test/ElemWiseVectorMult/vecMul16/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/ElemWiseVectorMult/vecMul16_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp similarity index 97% rename from test/ElemWiseVectorMult/vecMul16_2/main.cpp rename to test/BasicMathFun/VectorMult/VecMul16_2/main.cpp index ed6cb76..3fa482b 100644 --- a/test/ElemWiseVectorMult/vecMul16_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/ElemWiseVectorMult/vecMul32/main.cpp b/test/BasicMathFun/VectorMult/VecMul32/main.cpp similarity index 89% rename from test/ElemWiseVectorMult/vecMul32/main.cpp rename to test/BasicMathFun/VectorMult/VecMul32/main.cpp index 9ac33ac..0455a42 100644 --- a/test/ElemWiseVectorMult/vecMul32/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited 
and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/ElemWiseVectorMult/vecMul32_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp similarity index 92% rename from test/ElemWiseVectorMult/vecMul32_2/main.cpp rename to test/BasicMathFun/VectorMult/VecMul32_2/main.cpp index 323367c..bda9a5e 100644 --- a/test/ElemWiseVectorMult/vecMul32_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/MuLaw/Compression/main.cpp b/test/DuRuInterface/MuLaw/Compression/main.cpp similarity index 99% rename from test/MuLaw/Compression/main.cpp rename to test/DuRuInterface/MuLaw/Compression/main.cpp index c1b8e6c..e107e1f 100644 --- a/test/MuLaw/Compression/main.cpp +++ b/test/DuRuInterface/MuLaw/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/MuLaw/Decompression/main.cpp b/test/DuRuInterface/MuLaw/Decompression/main.cpp similarity index 98% rename from test/MuLaw/Decompression/main.cpp rename to test/DuRuInterface/MuLaw/Decompression/main.cpp index 067af2d..eb6ed52 100644 --- a/test/MuLaw/Decompression/main.cpp +++ b/test/DuRuInterface/MuLaw/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/XRanBlockFloat/Compression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp similarity index 99% rename from test/XRanBlockFloat/Compression/main.cpp rename to test/DuRuInterface/ORanBlockFloat/Compression/main.cpp index 43824bd..ecf5d28 100644 --- a/test/XRanBlockFloat/Compression/main.cpp +++ b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/XRanBlockFloat/Decompression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp similarity index 98% rename from test/XRanBlockFloat/Decompression/main.cpp rename to test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp index 4506d3b..087d3a0 100644 --- a/test/XRanBlockFloat/Decompression/main.cpp +++ b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/ORanBlockScaling/Compression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp similarity index 99% rename from test/ORanBlockScaling/Compression/main.cpp rename to test/DuRuInterface/ORanBlockScaling/Compression/main.cpp index ac0356c..72d680c 100644 --- a/test/ORanBlockScaling/Compression/main.cpp +++ b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its 
affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/ORanBlockScaling/Decompression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp similarity index 98% rename from test/ORanBlockScaling/Decompression/main.cpp rename to test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp index 2174493..45cb5fe 100644 --- a/test/ORanBlockScaling/Decompression/main.cpp +++ b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Correlation/main.cpp b/test/LowerPHY/Correlation/main.cpp similarity index 97% rename from test/Correlation/main.cpp rename to test/LowerPHY/Correlation/main.cpp index 192ba5f..fa31a8b 100644 --- a/test/Correlation/main.cpp +++ b/test/LowerPHY/Correlation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/FFT/cs16/main.cpp b/test/LowerPHY/FFT/FFT16/main.cpp similarity index 86% rename from test/FFT/cs16/main.cpp rename to test/LowerPHY/FFT/FFT16/main.cpp index 6f88d57..33b4b98 100644 --- a/test/FFT/cs16/main.cpp +++ b/test/LowerPHY/FFT/FFT16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -18,7 +18,7 @@ float clamp_neg1_to_1(float x) { float low = -1.0; - float high = (float)((1 << 15) - 1) / (1 << 15); + float high = (float32_t)((1 << 15) - 1) / (1 << 15); return std::max(low, std::min(high, x)); } @@ -33,13 +33,13 @@ static bool check_fft_results(const char *name, float tol = FLT_EPSILON * (4 * n - 1); // since the final result is rounded to Q0.15 format, this is also a // potential source of large error (especially for smaller problem sizes). 
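The clamp helper and the tolerance floor in this hunk both follow from the Q0.15 fixed-point convention of the cs16 FFT: an int16_t sample q represents the value q / 2^15, so the largest representable magnitude is 32767/32768 (hence clamp_neg1_to_1) and a single rounding step can move a component by about 1/2^15, which is why tol is floored at 2 / 2^15 just below. A minimal conversion sketch, independent of any ArmRAL API:

  // Sketch only: the Q0.15 interpretation assumed by these checks.
  static float32_t q15_to_float(int16_t q) {
    return (float32_t)q / (1 << 15); // one Q0.15 ULP is 1/32768
  }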
- tol = std::max((float)2 / (1 << 15), tol); + tol = std::max((float32_t)2 / (1 << 15), tol); for (uint32_t i = 0; i < n; ++i) { - auto res = std::complex((float)result[i].re / (1 << 15), - (float)result[i].im / (1 << 15)); - auto exp = std::complex(clamp_neg1_to_1(expected[i].re), - clamp_neg1_to_1(expected[i].im)); + auto res = std::complex((float32_t)result[i].re / (1 << 15), + (float32_t)result[i].im / (1 << 15)); + auto exp = std::complex(clamp_neg1_to_1(expected[i].re), + clamp_neg1_to_1(expected[i].im)); auto err = std::abs(res - exp); max_error = std::max(max_error, err); if (err > tol) { diff --git a/test/FFT/cf32/main.cpp b/test/LowerPHY/FFT/FFT32/main.cpp similarity index 92% rename from test/FFT/cf32/main.cpp rename to test/LowerPHY/FFT/FFT32/main.cpp index ed8483c..bdec57c 100644 --- a/test/FFT/cf32/main.cpp +++ b/test/LowerPHY/FFT/FFT32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -27,8 +27,8 @@ static bool check_fft_results(const char *name, float tol = FLT_EPSILON * (4 * n) * 10; for (uint32_t i = 0; i < n; ++i) { - std::complex res = {result[i].re, result[i].im}; - std::complex exp = {expected[i].re, expected[i].im}; + std::complex res = {result[i].re, result[i].im}; + std::complex exp = {expected[i].re, expected[i].im}; float err = std::abs(res - exp); max_error = std::max(err, max_error); if (err > tol) { diff --git a/test/FIR/arm_fir_filter_cs16/main.cpp b/test/LowerPHY/FIR/FIR16/main.cpp similarity index 94% rename from test/FIR/arm_fir_filter_cs16/main.cpp rename to test/LowerPHY/FIR/FIR16/main.cpp index 7103678..1cda4e8 100644 --- a/test/FIR/arm_fir_filter_cs16/main.cpp +++ b/test/LowerPHY/FIR/FIR16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp similarity index 95% rename from test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp rename to test/LowerPHY/FIR/FIR16Decimate2/main.cpp index a247b98..ab179e5 100644 --- a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp +++ b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/test/FIR/arm_fir_filter_cf32/main.cpp b/test/LowerPHY/FIR/FIR32/main.cpp similarity index 94% rename from test/FIR/arm_fir_filter_cf32/main.cpp rename to test/LowerPHY/FIR/FIR32/main.cpp index c8c3643..2112b9f 100644 --- a/test/FIR/arm_fir_filter_cf32/main.cpp +++ b/test/LowerPHY/FIR/FIR32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp similarity index 94% rename from test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp rename to test/LowerPHY/FIR/FIR32Decimate2/main.cpp index 9c8c8e8..361e705 100644 --- a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp +++ 
b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/Scrambling/main.cpp b/test/LowerPHY/Scrambling/main.cpp similarity index 93% rename from test/Scrambling/main.cpp rename to test/LowerPHY/Scrambling/main.cpp index 777276f..36ab300 100644 --- a/test/Scrambling/main.cpp +++ b/test/LowerPHY/Scrambling/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/test/SeqGenerator/main.cpp b/test/LowerPHY/SeqGenerator/main.cpp similarity index 95% rename from test/SeqGenerator/main.cpp rename to test/LowerPHY/SeqGenerator/main.cpp index 8bb2f61..70d50e6 100644 --- a/test/SeqGenerator/main.cpp +++ b/test/LowerPHY/SeqGenerator/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/SVD/main.cpp b/test/MatrixFactorizations/SVD/main.cpp similarity index 89% rename from test/SVD/main.cpp rename to test/MatrixFactorizations/SVD/main.cpp index a142790..5fe0f85 100644 --- a/test/SVD/main.cpp +++ b/test/MatrixFactorizations/SVD/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "svd_sample_data.h" @@ -9,13 +9,13 @@ namespace { // Routine for converting a vector of armral_cmplx_f32_t -// to a vector of complex. -std::vector> +// to a vector of complex. +std::vector> convert_arm_cf32_to_complex(uint16_t nvalues, const std::vector &a) { - std::vector> out(nvalues); + std::vector> out(nvalues); for (unsigned i = 0; i < nvalues; ++i) { - out[i] = std::complex(a[i].re, a[i].im); + out[i] = std::complex(a[i].re, a[i].im); } return out; } @@ -36,7 +36,7 @@ bool test_svd_with_sample(SVDFunction svd_function_under_test) { int m = test.m; int size = m * n; std::vector a = test.a; - std::vector s(n); + std::vector s(n); // Left and right singular vectors. std::vector u(size); @@ -51,7 +51,7 @@ bool test_svd_with_sample(SVDFunction svd_function_under_test) { // values computed. 
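For reference, the helpers invoked in this check sequence (defined in test/MatrixFactorizations/SVD/svd_test.hpp later in this patch) accept a result roughly when, with eps = std::numeric_limits<float32_t>::epsilon() and anorm = max(||A||_inf, 1):

  max_i |sref[i] - s[i]|   <  THRESHOLD * n * eps           (check_singular_values)
  ||A - U * S * VT||_inf   <  THRESHOLD * anorm * m * eps   (check_svd_decomposition)
  ||Id - Q^H * Q||_inf     <  THRESHOLD * n * eps           (check_orthogonality, Q in {U, VT})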
passed &= check_singular_values(n, test.s, s); - // Convert data to complex for testing + // Convert data to complex for testing auto aref_cmplx = convert_arm_cf32_to_complex(size, test.a); auto u_cmplx = convert_arm_cf32_to_complex(size, u); auto vt_cmplx = convert_arm_cf32_to_complex(n * n, vt); @@ -75,8 +75,8 @@ bool test_svd(bool gen_singular_vectors, int m, int n, float cond, // Generate test matrix with prescribed // singular values and condition number std::vector a(size); - std::vector s(n); - std::vector sref(n); + std::vector s(n); + std::vector sref(n); int seed = 0; generate_svd_matrix(m, n, a, sref, cond, seed); @@ -99,7 +99,7 @@ bool test_svd(bool gen_singular_vectors, int m, int n, float cond, bool passed = check_singular_values(n, sref, s); if (gen_singular_vectors) { - // Convert data to complex for testing + // Convert data to complex for testing auto aref_cmplx = convert_arm_cf32_to_complex(size, aref); auto u_cmplx = convert_arm_cf32_to_complex(size, u); auto vt_cmplx = convert_arm_cf32_to_complex(n * n, vt); @@ -129,7 +129,7 @@ bool run_all_tests(char const *name, SVDFunction svd_function) { std::vector nb_row = {32, 50, 64, 128}; std::vector nb_col = {4, 8, 16, 20, 28, 32}; std::vector check_full_decomposition = {true, false}; - std::vector cond_number{4, 32, 100, 100, 1000, 10000}; + std::vector cond_number{4, 32, 100, 100, 1000, 10000}; for (auto m : nb_row) { for (auto n : nb_col) { for (auto cond : cond_number) { diff --git a/test/SVD/svd_sample_data.h b/test/MatrixFactorizations/SVD/svd_sample_data.h similarity index 93% rename from test/SVD/svd_sample_data.h rename to test/MatrixFactorizations/SVD/svd_sample_data.h index 0add752..51accb3 100644 --- a/test/SVD/svd_sample_data.h +++ b/test/MatrixFactorizations/SVD/svd_sample_data.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -16,8 +16,8 @@ struct svd_test_param_t { uint32_t m; uint32_t n; std::vector a; - std::vector s; // singular values - float cond; + std::vector s; // singular values + float32_t cond; }; std::vector svd_sample_tests = { diff --git a/test/SVD/svd_test.hpp b/test/MatrixFactorizations/SVD/svd_test.hpp similarity index 85% rename from test/SVD/svd_test.hpp rename to test/MatrixFactorizations/SVD/svd_test.hpp index a3ca485..3cbcafb 100644 --- a/test/SVD/svd_test.hpp +++ b/test/MatrixFactorizations/SVD/svd_test.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -12,8 +12,7 @@ #include #include -#define SVD_TEST -#include "SVD/matrix_view.hpp" +#include "MatrixFactorizations/SVD/matrix_view.hpp" #include "cf32_utils.hpp" // In the accuracy tests, a computed solution @@ -27,26 +26,26 @@ // in division by a small floating point number. 
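Two details of the reflector generation that follows are worth keeping in mind. SAFEMIN, defined next as 1.17549E-38, is essentially FLT_MIN, the smallest normalized single-precision value; clarfg compares |beta| against SAFEMIN divided by machine epsilon and repeatedly rescales the input (at most max_attempt = 10 times) so that the later division by beta cannot overflow. The quantities clarfg returns appear to mirror LAPACK's clarfg convention, sketched here in plain notation under that assumption:

  beta = -sign(Re(alpha)) * ||(alpha, x)||_2      (real)
  tau  = (beta - alpha) / beta
  v    = (1, x / (alpha - beta))                  (the leading 1 is implicit)

so that the reflector H = I - tau * v * v^H, applied as in LAPACK (H^H acting on the input), maps (alpha, x) onto (beta, 0, ..., 0).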
#define SAFEMIN 1.17549E-38 -typedef std::complex cf32_t; +typedef std::complex cf32_t; // Generate m-by-n, single complex random matrix static inline std::vector generate_rand(const int m, const int n) { int size = m * n; std::vector a = allocate_random_cf32(size); - // Convert matrix to std::complex type + // Convert matrix to std::complex type std::vector out(size); for (int i = 0; i < size; ++i) { - out[i] = std::complex(a[i].re, a[i].im); + out[i] = std::complex(a[i].re, a[i].im); } return out; } -static inline float infinity_norm(int m, int n, const cf32_t *a) { +static inline float32_t infinity_norm(int m, int n, const cf32_t *a) { column_major_matrix_view a_mat{a, m}; - float inorm = 0; + float32_t inorm = 0; for (int i = 0; i < m; i++) { - float tmp = 0; + float32_t tmp = 0; for (int j = 0; j < n; j++) { tmp += std::abs(a_mat(i, j)); } @@ -59,7 +58,8 @@ static inline float infinity_norm(int m, int n, const cf32_t *a) { // Overload infinity_norm with an interface // with std::vector data type as input -static inline float infinity_norm(int m, int n, const std::vector &a) { +static inline float32_t infinity_norm(int m, int n, + const std::vector &a) { return infinity_norm(m, n, a.data()); } @@ -72,7 +72,7 @@ static inline cf32_t clarfg(const int n, cf32_t &aii, cf32_t *x, cf32_t alpha = aii; // Sum of x[i] * conj(x[i]) - float sum = 0.0F; + float32_t sum = 0.0F; for (int i = 0; i < n * incx; i += incx) { sum += real(x[i] * conj(x[i])); } @@ -84,12 +84,12 @@ static inline cf32_t clarfg(const int n, cf32_t &aii, cf32_t *x, // Add alpha * conj(alpha) to sum // to compute the 2 norm of the full vector sum += real(alpha * conj(alpha)); - float beta = -copysign(sqrt(sum), real(alpha)); - float safmin = SAFEMIN / std::numeric_limits::epsilon(); - float rsafmin = 1.0F / safmin; + float32_t beta = -copysign(sqrt(sum), real(alpha)); + float32_t safmin = SAFEMIN / std::numeric_limits::epsilon(); + float32_t rsafmin = 1.0F / safmin; int cnt = 0; int max_attempt = 10; - float scale = 1.0F; + float32_t scale = 1.0F; // Check if beta is small enough to induce // overflow when taking the inverse, and // if it is the case, scale to avoid overflow @@ -111,16 +111,16 @@ static inline cf32_t clarfg(const int n, cf32_t &aii, cf32_t *x, // Compute tau and update aii cf32_t tau = (beta - alpha) / beta; - cf32_t normalisation_factor = 1.0F / (alpha - beta); + cf32_t normalization_factor = 1.0F / (alpha - beta); for (int i = 0; i < n * incx; i += incx) { - x[i] = normalisation_factor * x[i]; + x[i] = normalization_factor * x[i]; } beta /= scale; aii = beta; return tau; } -// householder_qr computes the QR factorisation A = QR. +// householder_qr computes the QR factorization A = QR. // On exit, the elements on and above the diagonal // of the A contain the upper triangular matrix R. // The elements below the diagonal, with the array tau, @@ -160,7 +160,7 @@ static inline void householder_qr(const int m, const int n, cf32_t *a, } // Apply implicitly Q to an input matrix C of the same dimension -// as the matrix A that has been factorized into QR or bidiagonalisation. +// as the matrix A that has been factorized into QR or bidiagonalization. 
static inline void apply_q(int m, int n, const cf32_t *a, const cf32_t *tau, cf32_t *c) { if (m < n) { @@ -199,8 +199,8 @@ static inline void apply_q(int m, int n, const std::vector &a, apply_q(m, n, a.data(), tau.data(), c.data()); } -// Generate explicitly Q from QR factorisation or from -// the bidiagonalisation A = Q * B * P^H +// Generate explicitly Q from QR factorization or from +// the bidiagonalization A = Q * B * P^H static inline std::vector get_q(const int m, const int n, const std::vector &a, const std::vector &tau) { @@ -246,7 +246,7 @@ static inline std::vector get_q(const int m, const int n, } // Generate the orthogonal matrix P from -// the bidiagonalisation A = Q * B * P^H, +// the bidiagonalization A = Q * B * P^H, // note that P^H is generated directly // instead of P static inline void get_p(int m, int n, const cf32_t *a, const cf32_t *tau, @@ -322,18 +322,18 @@ static inline void get_p(int m, int n, const std::vector &a, // singular values and condition number. // This routine first sets the singular values in // the array S, then generates two orthogonal matrices -// Q1 and Q2 using QR factorisation, and form the +// Q1 and Q2 using QR factorization, and form the // final matrix as Q 1* S * Q2. static inline void generate_svd_matrix(const int m, const int n, std::vector &a, - std::vector &s, const float cond, - const int seed) { + std::vector &s, + const float32_t cond, const int seed) { // Generate singular values from 1 to 1/cond // where cond is the condition number of the matrix for (int i = 0; i < n; i++) { - float rcond = (1 - 1 / cond); - s[i] = 1 - (float)i / (n - 1) * rcond; + float32_t rcond = (1 - 1 / cond); + s[i] = 1 - (float32_t)i / (n - 1) * rcond; } srand(seed); @@ -368,7 +368,7 @@ static inline void generate_svd_matrix(const int m, const int n, } apply_q(m, n, a1, tau1, a_cmplx); - // Convert vector> to vector + // Convert vector> to vector for (int i = 0; i < m * n; ++i) { a[i] = {real(a_cmplx[i]), imag(a_cmplx[i])}; } @@ -389,9 +389,9 @@ static inline void generate_svd_matrix(const int m, const int n, // the bidiagonal matrix B. Note that this routine // returns directly the conjugate transpose of the // left orthogonal matrix. -static inline void bidiagonalisation(const int m, const int n, cf32_t *a, - std::vector &d, - std::vector &e, +static inline void bidiagonalization(const int m, const int n, cf32_t *a, + std::vector &d, + std::vector &e, std::vector &tauq, std::vector &taup) { @@ -464,32 +464,33 @@ static inline void bidiagonalisation(const int m, const int n, cf32_t *a, } // Computation of Givens rotation components. 
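The rotg helper defined next computes the components of a Givens rotation: given (f, g) it returns (cs, sn, r) with cs*f + sn*g = r and -sn*f + cs*g = 0, branching on |f| > |g| only so that the intermediate ratio t stays at most 1 in magnitude and t*t cannot overflow. A small usage sketch, assuming rotg, <cmath> and <tuple> are in scope as in svd_test.hpp:

  // Sketch only: the defining property of the rotation returned by rotg.
  static bool rotg_annihilates_second_component() {
    float32_t cs;
    float32_t sn;
    float32_t r;
    std::tie(cs, sn, r) = rotg(3.0F, 4.0F); // expect cs = 0.6, sn = 0.8, r = 5
    float32_t rotated_f = cs * 3.0F + sn * 4.0F;  // equals r
    float32_t rotated_g = -sn * 3.0F + cs * 4.0F; // equals 0 up to rounding
    return std::abs(rotated_f - r) < 1e-5F && std::abs(rotated_g) < 1e-5F;
  }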
-inline static std::tuple rotg(const float f, - const float g) { +inline static std::tuple +rotg(const float32_t f, const float32_t g) { if (f == 0) { - float cs = 0.0F; - float sn = 1.0F; + float32_t cs = 0.0F; + float32_t sn = 1.0F; return std::make_tuple(cs, sn, g); } if (std::abs(f) > std::abs(g)) { - float t = g / f; - float tt = sqrt(1 + t * t); - float cs = 1 / tt; - float sn = t / tt; + float32_t t = g / f; + float32_t tt = sqrt(1 + t * t); + float32_t cs = 1 / tt; + float32_t sn = t / tt; return std::make_tuple(cs, sn, f * tt); } - float t = f / g; - float tt = sqrt(1 + t * t); - float sn = 1 / tt; - float cs = t / tt; + float32_t t = f / g; + float32_t tt = sqrt(1 + t * t); + float32_t sn = 1 / tt; + float32_t cs = t / tt; return std::make_tuple(cs, sn, g * tt); } // This routine updates singular vectors // by applying the Givens rotations // used to update the bidiagonal matrix -inline static void update_sigvect(const int m, const float cs, const float sn, - cf32_t *v1, cf32_t *v2, const int incv) { +inline static void update_sigvect(const int m, const float32_t cs, + const float32_t sn, cf32_t *v1, cf32_t *v2, + const int incv) { for (int i = 0; i < m * incv; i += incv) { cf32_t t = v1[i]; v1[i] = cs * t + sn * v2[i]; @@ -510,9 +511,9 @@ inline static void update_sigvect(const int m, const float cs, const float sn, // "Singular Value Decomposition and Least Squares Solutions" // published in Numer. Math. 14, 403--420 (1970). inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, - const int n, std::vector &d, - std::vector &e, cf32_t *u, cf32_t *vt, - const int u_stride) { + const int n, std::vector &d, + std::vector &e, cf32_t *u, + cf32_t *vt, const int u_stride) { if (m < n) { // GCOVR_EXCL_START @@ -530,14 +531,14 @@ inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, // Compute the 1-norm of the bidiagonal matrix // for the computation of the stopping criteria. - float anorm = 0; + float32_t anorm = 0; for (int i = 0; i < n; i++) { - float tmp = std::abs(d[i]) + std::abs(e[i]); + float32_t tmp = std::abs(d[i]) + std::abs(e[i]); if (anorm < tmp) { anorm = tmp; } } - float tol = THRESHOLD * anorm * std::numeric_limits::epsilon(); + float32_t tol = THRESHOLD * anorm * std::numeric_limits::epsilon(); int maxiter = 2 * n; // Loop over the columns @@ -567,16 +568,16 @@ inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, // In this case, an extra sequence of Givens rotations is // applied from the left to annihilate the off-diagonal E[next_col]. if (diag_is_zero) { - float cs = 0.0; - float sn = 1.0; + float32_t cs = 0.0; + float32_t sn = 1.0; for (int i = next_col; i < curr_col; i++) { - float f = sn * e[i]; + float32_t f = sn * e[i]; e[i] *= cs; if (std::abs(f) <= tol) { break; } - float g = d[i]; - float h; + float32_t g = d[i]; + float32_t h; std::tie(cs, sn, h) = rotg(f, g); d[i] = h; // Update left singular vectors. @@ -586,7 +587,7 @@ inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, } } } - float z = d[curr_col]; + float32_t z = d[curr_col]; if (next_col == curr_col) { // Make singular value nonnegative and update // the corresponding right singular vectors. @@ -618,20 +619,20 @@ inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, // the 2 eigenvalues are (d1 + d2)/2 +/- sqrt(((d1 - d2)/2)^2 + e1^2). // The choice of this shift accelerates the convergence of the // most bottom off-diagonal E[curr_col] to zero. 
- float x = d[next_col]; - float y = d[curr_col - 1]; - float g = e[curr_col - 1]; - float h = e[curr_col]; + float32_t x = d[next_col]; + float32_t y = d[curr_col - 1]; + float32_t g = e[curr_col - 1]; + float32_t h = e[curr_col]; // a^2 - b^2 operations are computed as // (a - b)* (a + b) to avoid overflow. - float f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + float32_t f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); g = sqrt(f * f + 1); f = ((x - z) * (x + z) + h * (y / (f + copysign(g, f)) - h)) / x; // Shifted QR iteration, bulge chasing, applying // successive Givens rotations from right then from left. - float c = 1.0F; - float s = 1.0F; + float32_t c = 1.0F; + float32_t s = 1.0F; for (int i = next_col + 1; i <= curr_col; i++) { g = e[i]; y = d[i]; @@ -693,18 +694,18 @@ inline static int svd_bidiagonal(const bool gen_singular_vectors, const int m, // armral_svd computes the SVD decomposition // of an m-by-n matrix A by first performing -// the bidigonalisation of A, then computing +// the bidigonalization of A, then computing // the SVD of the bidiagonal matrix and update // the singular vectors if required. static inline int svd_cf32(bool gen_singular_vect, const int m, const int n, - std::vector &a, std::vector &s, + std::vector &a, std::vector &s, std::vector &u, std::vector &vt) { - // Bidiagonalisation: A = Q * B * P^H. + // Bidiagonalization: A = Q * B * P^H. std::vector tauq(n); std::vector taup(n); - std::vector e(n); - bidiagonalisation(m, n, a.data(), s, e, tauq, taup); + std::vector e(n); + bidiagonalization(m, n, a.data(), s, e, tauq, taup); // Generate left and right orthogonal vectors if required. if (gen_singular_vect) { @@ -722,14 +723,14 @@ static inline int svd_cf32(bool gen_singular_vect, const int m, const int n, // armral_svd computes the SVD decomposition // of an m-by-n matrix A in 4 steps. -// 1- QR factorisation of A. -// 2- Bidiagonalisation of R. +// 1- QR factorization of A. +// 2- Bidiagonalization of R. // 3- SVD of the bidigonal matrix from R. // 4- Update of the left singular vectors // with the orthogonal matrix from QR. static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, const int n, std::vector &a, - std::vector &s, std::vector &u, + std::vector &s, std::vector &u, std::vector &vt) { column_major_matrix_view a_mat{a.data(), m}; @@ -745,18 +746,18 @@ static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, r_mat(i, j) = a_mat(i, j); } } - // Bidiagonalisation of R. + // Bidiagonalization of R. std::vector tauq(n); std::vector taup(n); - std::vector e(n); - bidiagonalisation(n, n, r.data(), s, e, tauq, taup); + std::vector e(n); + bidiagonalization(n, n, r.data(), s, e, tauq, taup); // Generate left and right orthogonal vectors. if (gen_singular_vect) { // Generate Q, and store it in u1. std::vector u1 = get_q(n, n, r, tauq); // Copy u1 in u - // Initialise u to zero in case it is not. + // Initialize u to zero in case it is not. 
u.assign(u.size(), 0.0F); column_major_matrix_view u_mat{u.data(), m}; column_major_matrix_view u1_mat{u1.data(), n}; @@ -782,7 +783,7 @@ static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, // Check ||Id - Q^H * Q||_∞/n < THRESHOLD * epsilon static inline bool check_orthogonality(const int m, const int n, cf32_t *q) { - float tol = THRESHOLD * n * std::numeric_limits::epsilon(); + float32_t tol = THRESHOLD * n * std::numeric_limits::epsilon(); // Build an identity matrix Id std::vector a(n * n); @@ -800,7 +801,7 @@ static inline bool check_orthogonality(const int m, const int n, cf32_t *q) { } } // Compute the infinity norm ||Id - Q^H * Q||_∞ - float inorm = infinity_norm(n, n, a); + float32_t inorm = infinity_norm(n, n, a); return inorm < tol; } @@ -819,9 +820,10 @@ static inline bool check_qr_decomposition(int m, int n, const cf32_t *aref, column_major_matrix_view a_mat{a, m}; // Infinity norm of Aref - float anorm = infinity_norm(m, n, aref); + float32_t anorm = infinity_norm(m, n, aref); anorm = anorm > 1 ? anorm : 1; - float tol = THRESHOLD * anorm * m * std::numeric_limits::epsilon(); + float32_t tol = + THRESHOLD * anorm * m * std::numeric_limits::epsilon(); // Extract R, allocate m-by-n memory for // the multiplication by A later @@ -847,7 +849,7 @@ static inline bool check_qr_decomposition(int m, int n, const cf32_t *aref, } } // Compute the norm of Aref - Q * R - float error = infinity_norm(m, n, c); + float32_t error = infinity_norm(m, n, c); return error < tol; } @@ -890,16 +892,16 @@ static inline void matmul(int m, int n, int k, const std::vector &a, // Check || A - Q * B * P^H||_∞/ || A ||_∞ < tol // where B is an upper bidiagonal matrix with diagonal // entries in D, and superdiagonal entries in E. -static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref, - const cf32_t *a, const float *d, - const float *e, - const cf32_t *tauq, - const cf32_t *taup) { +static inline bool +check_bidiag_decomposition(int m, int n, const cf32_t *aref, const cf32_t *a, + const float32_t *d, const float32_t *e, + const cf32_t *tauq, const cf32_t *taup) { // Infinity norm of aref - float anorm = infinity_norm(m, n, aref); + float32_t anorm = infinity_norm(m, n, aref); anorm = anorm > 1 ? 
anorm : 1; - float tol = THRESHOLD * anorm * m * std::numeric_limits::epsilon(); + float32_t tol = + THRESHOLD * anorm * m * std::numeric_limits::epsilon(); // Generate right orthogonal vectors @@ -931,7 +933,7 @@ static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref, } } // Infinity norm of error - float error = infinity_norm(m, n, c); + float32_t error = infinity_norm(m, n, c); return error < tol; } @@ -939,19 +941,19 @@ static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref, // with std::vector data type as input static inline bool check_bidiag_decomposition( int m, int n, const std::vector &aref, const std::vector &a, - const std::vector &d, const std::vector &e, + const std::vector &d, const std::vector &e, const std::vector &tauq, const std::vector &taup) { return check_bidiag_decomposition(m, n, aref.data(), a.data(), d.data(), e.data(), tauq.data(), taup.data()); } -static inline bool check_singular_values(int n, const float *sref, - const float *s) { - float tol = THRESHOLD * n * std::numeric_limits::epsilon(); - float error = 0.0F; +static inline bool check_singular_values(int n, const float32_t *sref, + const float32_t *s) { + float32_t tol = THRESHOLD * n * std::numeric_limits::epsilon(); + float32_t error = 0.0F; for (int i = 0; i < n; i++) { - float tmp = std::abs(sref[i] - s[i]); + float32_t tmp = std::abs(sref[i] - s[i]); if (tmp > error) { error = tmp; } @@ -961,21 +963,23 @@ static inline bool check_singular_values(int n, const float *sref, // Overload check_singular_values with an interface // with std::vector data type as inputs -static inline bool check_singular_values(int n, const std::vector &sref, - const std::vector &s) { +static inline bool check_singular_values(int n, + const std::vector &sref, + const std::vector &s) { return check_singular_values(n, sref.data(), s.data()); } // Check the accuracy of SVD decomposition // error = ||A - U * S *VT^H||_∞/ (||A||_∞ * m) static inline bool check_svd_decomposition(int m, int n, const cf32_t *a, - const float *s, const cf32_t *u, + const float32_t *s, const cf32_t *u, const cf32_t *vt) { // Infinity norm of A - float anorm = infinity_norm(m, n, a); + float32_t anorm = infinity_norm(m, n, a); anorm = anorm > 1 ? 
anorm : 1; - float tol = THRESHOLD * anorm * m * std::numeric_limits::epsilon(); + float32_t tol = + THRESHOLD * anorm * m * std::numeric_limits::epsilon(); // U1 = U * S std::vector u1(m * n); @@ -994,7 +998,7 @@ static inline bool check_svd_decomposition(int m, int n, const cf32_t *a, matmul(m, n, n, u1.data(), vt, -1.0F, c.data()); // Compute the infinity norm ||A - U * S * VT^H||_oo - float error = infinity_norm(m, n, c); + float32_t error = infinity_norm(m, n, c); return error < tol; } @@ -1002,7 +1006,7 @@ static inline bool check_svd_decomposition(int m, int n, const cf32_t *a, // with std::vector data type as inputs static inline bool check_svd_decomposition(int m, int n, const std::vector &a, - const std::vector &s, + const std::vector &s, const std::vector &u, const std::vector &vt) { return check_svd_decomposition(m, n, a.data(), s.data(), u.data(), vt.data()); diff --git a/test/CRC/main.cpp b/test/UpperPHY/CRC/main.cpp similarity index 97% rename from test/CRC/main.cpp rename to test/UpperPHY/CRC/main.cpp index ed3941a..47a4d9b 100644 --- a/test/CRC/main.cpp +++ b/test/UpperPHY/CRC/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/ConvCoding/decoding/main.cpp b/test/UpperPHY/ConvolutionalDecoder/main.cpp similarity index 97% rename from test/ConvCoding/decoding/main.cpp rename to test/UpperPHY/ConvolutionalDecoder/main.cpp index d768fbe..dcebd77 100644 --- a/test/ConvCoding/decoding/main.cpp +++ b/test/UpperPHY/ConvolutionalDecoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/test/ConvCoding/encoding/main.cpp b/test/UpperPHY/ConvolutionalEncoder/main.cpp similarity index 96% rename from test/ConvCoding/encoding/main.cpp rename to test/UpperPHY/ConvolutionalEncoder/main.cpp index fab64d1..640bd1f 100644 --- a/test/ConvCoding/encoding/main.cpp +++ b/test/UpperPHY/ConvolutionalEncoder/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/test/Demodulation/main.cpp b/test/UpperPHY/Demodulation/main.cpp similarity index 97% rename from test/Demodulation/main.cpp rename to test/UpperPHY/Demodulation/main.cpp index 4833b65..bfc68be 100644 --- a/test/Demodulation/main.cpp +++ b/test/UpperPHY/Demodulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" @@ -108,7 +108,7 @@ static void demodulation_64qam_ref(const uint32_t n_symbols, const uint16_t ulp, // proportional to an offset of the modulated symbol received uint16_t weight = (1 << 15) / ulp; // The amplitudes are in {1, 3, 5, 7} / sqrt(42) - // These are organised using a Gray encoding, in the following manner + // These are organized using a Gray encoding, in the following manner // 01 -> 1/sqrt(42) // 00 -> 3/sqrt(42) // 10 -> 5/sqrt(42) @@ -148,7 +148,7 @@ static void demodulation_256qam_ref(const uint32_t n_symbols, uint16_t 
weight = (1 << 15) / ulp; // The amplitudes are in {1, 3, 5, 7, 9, 11, 13, 15} / sqrt(170) - // These are organised in a Gray encoding, and we can get the + // These are organized in a Gray encoding, and we can get the // log-likelihood ratios by performing the following operations // for each of the 8 bits encoded in the symbol s = {s.re, s.im} // LLR(b0|s) = weight * -s.re diff --git a/test/LDPC/decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp similarity index 98% rename from test/LDPC/decoding/main.cpp rename to test/UpperPHY/LDPC/Decoding/main.cpp index 9362a05..f557ea2 100644 --- a/test/LDPC/decoding/main.cpp +++ b/test/UpperPHY/LDPC/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "../ldpc_test_common.hpp" diff --git a/test/LDPC/encoding/ldpc_encoding_test_data.h b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h similarity index 98% rename from test/LDPC/encoding/ldpc_encoding_test_data.h rename to test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h index 09947ba..4a94f7a 100644 --- a/test/LDPC/encoding/ldpc_encoding_test_data.h +++ b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/LDPC/encoding/main.cpp b/test/UpperPHY/LDPC/Encoding/main.cpp similarity index 98% rename from test/LDPC/encoding/main.cpp rename to test/UpperPHY/LDPC/Encoding/main.cpp index 8b78ea4..59ba4e3 100644 --- a/test/LDPC/encoding/main.cpp +++ b/test/UpperPHY/LDPC/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "../ldpc_test_common.hpp" #include "armral.h" @@ -195,7 +195,7 @@ std::vector armral_ldpc_encode_block_ref(const uint8_t *data_in, // Perform the encoding // We need to invert a system of equations for the // first 4 * z rows, which correspond to the high-density - // sub-matrix portion. Initialisation is to zero + // sub-matrix portion. 
Initialization is to zero std::vector parity_hdsm(4 * z); // Rename a variable for clarity how it is used in this function diff --git a/test/LDPC/rate_matching/main.cpp b/test/UpperPHY/LDPC/RateMatching/main.cpp similarity index 99% rename from test/LDPC/rate_matching/main.cpp rename to test/UpperPHY/LDPC/RateMatching/main.cpp index 783c882..ceb7267 100644 --- a/test/LDPC/rate_matching/main.cpp +++ b/test/UpperPHY/LDPC/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/test/LDPC/rate_recovery/main.cpp b/test/UpperPHY/LDPC/RateRecovery/main.cpp similarity index 99% rename from test/LDPC/rate_recovery/main.cpp rename to test/UpperPHY/LDPC/RateRecovery/main.cpp index 499d98b..993b08e 100644 --- a/test/LDPC/rate_recovery/main.cpp +++ b/test/UpperPHY/LDPC/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" @@ -215,7 +215,6 @@ bool test_ref_rate_recovery() { passed &= std::equal(out.begin(), out.begin() + k0, in.begin() + (e - k0)); // Test selection process with shortening - e = 80; n = 100; k0 = 16; diff --git a/test/LDPC/ldpc_test_common.hpp b/test/UpperPHY/LDPC/ldpc_test_common.hpp similarity index 95% rename from test/LDPC/ldpc_test_common.hpp rename to test/UpperPHY/LDPC/ldpc_test_common.hpp index 0623f9f..2b8d4a9 100644 --- a/test/LDPC/ldpc_test_common.hpp +++ b/test/UpperPHY/LDPC/ldpc_test_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/test/Modulation/main.cpp b/test/UpperPHY/Modulation/main.cpp similarity index 99% rename from test/Modulation/main.cpp rename to test/UpperPHY/Modulation/main.cpp index 0cb0a3a..2fff2b6 100644 --- a/test/Modulation/main.cpp +++ b/test/UpperPHY/Modulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/Polar/crc_attachment/main.cpp b/test/UpperPHY/Polar/CrcAttachment/main.cpp similarity index 93% rename from test/Polar/crc_attachment/main.cpp rename to test/UpperPHY/Polar/CrcAttachment/main.cpp index 8d67cd4..21f1f71 100644 --- a/test/Polar/crc_attachment/main.cpp +++ b/test/UpperPHY/Polar/CrcAttachment/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "int8_utils.hpp" #include "polar_crc_attach_data.hpp" diff --git a/test/Polar/crc_attachment/polar_crc_attach_data.hpp b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp similarity index 85% rename from test/Polar/crc_attachment/polar_crc_attach_data.hpp rename to test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp index cb8c986..555f4af 100644 --- a/test/Polar/crc_attachment/polar_crc_attach_data.hpp +++ b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm 
Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Polar/decoding/main.cpp b/test/UpperPHY/Polar/Decoding/main.cpp similarity index 97% rename from test/Polar/decoding/main.cpp rename to test/UpperPHY/Polar/Decoding/main.cpp index 5b36846..e6f48ac 100644 --- a/test/Polar/decoding/main.cpp +++ b/test/UpperPHY/Polar/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Polar/encoding/main.cpp b/test/UpperPHY/Polar/Encoding/main.cpp similarity index 92% rename from test/Polar/encoding/main.cpp rename to test/UpperPHY/Polar/Encoding/main.cpp index 7c1d9ac..ae53d57 100644 --- a/test/Polar/encoding/main.cpp +++ b/test/UpperPHY/Polar/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Polar/frozen/main.cpp b/test/UpperPHY/Polar/Frozen/main.cpp similarity index 98% rename from test/Polar/frozen/main.cpp rename to test/UpperPHY/Polar/Frozen/main.cpp index 5be4671..341383f 100644 --- a/test/Polar/frozen/main.cpp +++ b/test/UpperPHY/Polar/Frozen/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/rate_matching/main.cpp b/test/UpperPHY/Polar/RateMatching/main.cpp similarity index 98% rename from test/Polar/rate_matching/main.cpp rename to test/UpperPHY/Polar/RateMatching/main.cpp index 6afd323..4bbc017 100644 --- a/test/Polar/rate_matching/main.cpp +++ b/test/UpperPHY/Polar/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/rate_recovery/main.cpp b/test/UpperPHY/Polar/RateRecovery/main.cpp similarity index 98% rename from test/Polar/rate_recovery/main.cpp rename to test/UpperPHY/Polar/RateRecovery/main.cpp index fe3ce7d..8d7d5b7 100644 --- a/test/Polar/rate_recovery/main.cpp +++ b/test/UpperPHY/Polar/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/subchannel_deinterleave/main.cpp b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp similarity index 92% rename from test/Polar/subchannel_deinterleave/main.cpp rename to test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index 59b3ef8..0b0840f 100644 --- a/test/Polar/subchannel_deinterleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/subchannel_interleave/main.cpp 
b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp similarity index 95% rename from test/Polar/subchannel_interleave/main.cpp rename to test/UpperPHY/Polar/SubchannelInterleave/main.cpp index bfc6e55..4a83cb0 100644 --- a/test/Polar/subchannel_interleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/decoding/main.cpp b/test/UpperPHY/Turbo/Decoding/main.cpp similarity index 96% rename from test/Turbo/decoding/main.cpp rename to test/UpperPHY/Turbo/Decoding/main.cpp index 027056e..af4d929 100644 --- a/test/Turbo/decoding/main.cpp +++ b/test/UpperPHY/Turbo/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -11,7 +11,7 @@ #include // Check that the decoder returns the expected error code when -// passed an invalid value of k. We can safely pass uninitialised +// passed an invalid value of k. We can safely pass uninitialized // memory to the routine as the parameter test is the first thing // it does and it will return immediately when k is invalid. static bool run_turbo_decoding_parameter_test() { diff --git a/test/Turbo/encoding/main.cpp b/test/UpperPHY/Turbo/Encoding/main.cpp similarity index 95% rename from test/Turbo/encoding/main.cpp rename to test/UpperPHY/Turbo/Encoding/main.cpp index b945f22..072218c 100644 --- a/test/Turbo/encoding/main.cpp +++ b/test/UpperPHY/Turbo/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -12,7 +12,7 @@ #include // Check that the encoder returns the expected error code when -// passed an invalid value of k. We can safely pass uninitialised +// passed an invalid value of k. We can safely pass uninitialized // memory to the routine as the parameter test is the first thing // it does and it will return immediately when k is invalid. 
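// ---- Editor's note: illustrative sketch, not part of this patch ----
// The comment above describes the parameter-test pattern used by the Turbo
// encoder/decoder tests: call the routine with an invalid k and check only the
// returned status, which is safe even with uninitialized buffers because the
// parameter check happens before any data is read. The decode_block() function
// and its status codes below are hypothetical stand-ins for illustration, not
// the armral API.
#include <cstdint>
#include <vector>

enum class status { success, invalid_argument };

// Hypothetical routine: rejects unsupported block sizes before touching data.
static status decode_block(uint32_t k, const int8_t *in, uint8_t *out) {
  if (k == 0 || k % 8 != 0) {
    return status::invalid_argument;
  }
  // ... real decoding would happen here ...
  (void)in;
  (void)out;
  return status::success;
}

static bool run_parameter_test_sketch() {
  std::vector<int8_t> in(64); // contents are never read when k is invalid
  std::vector<uint8_t> out(8);
  // k = 3 is deliberately invalid; only the return code matters.
  return decode_block(3, in.data(), out.data()) == status::invalid_argument;
}
// ---- End of editor's note ----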
static bool run_turbo_encoding_parameter_test() { diff --git a/test/Turbo/encoding/reference_turbo_encoder.hpp b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp similarity index 99% rename from test/Turbo/encoding/reference_turbo_encoder.hpp rename to test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp index aec3668..451b6b2 100644 --- a/test/Turbo/encoding/reference_turbo_encoder.hpp +++ b/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Turbo/rate_matching/main.cpp b/test/UpperPHY/Turbo/RateMatching/main.cpp similarity index 99% rename from test/Turbo/rate_matching/main.cpp rename to test/UpperPHY/Turbo/RateMatching/main.cpp index be9c29f..4353ab5 100644 --- a/test/Turbo/rate_matching/main.cpp +++ b/test/UpperPHY/Turbo/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/rate_recovery/main.cpp b/test/UpperPHY/Turbo/RateRecovery/main.cpp similarity index 94% rename from test/Turbo/rate_recovery/main.cpp rename to test/UpperPHY/Turbo/RateRecovery/main.cpp index 36c748b..5f91d3a 100644 --- a/test/Turbo/rate_recovery/main.cpp +++ b/test/UpperPHY/Turbo/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/rate_recovery/rate_recovery_data.hpp b/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp similarity index 99% rename from test/Turbo/rate_recovery/rate_recovery_data.hpp rename to test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp index 0117f14..361289e 100644 --- a/test/Turbo/rate_recovery/rate_recovery_data.hpp +++ b/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp similarity index 89% rename from test/Turbo/turbo_test_data.hpp rename to test/UpperPHY/Turbo/turbo_test_data.hpp index cc47d69..4507b9b 100644 --- a/test/Turbo/turbo_test_data.hpp +++ b/test/UpperPHY/Turbo/turbo_test_data.hpp @@ -1,14 +1,14 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include "rng.hpp" static void generate_turbo_test_data(uint8_t *src, uint32_t k) { - static linear_congruential_generator lcg; - auto state = random_state::from_seeds({k}); + static armral::utils::linear_congruential_generator lcg; + auto state = armral::utils::random_state::from_seeds({k}); // k is always divisible by 8 uint32_t k_bytes = k >> 3; diff --git a/utils/bit_utils.hpp b/utils/bit_utils.hpp index 0de9b1c..1ed60cf 100644 --- a/utils/bit_utils.hpp +++ b/utils/bit_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 
Arm Limited and/or its affiliates */ #pragma once @@ -97,7 +97,7 @@ static inline void bytes_to_bits(uint32_t n, const uint8_t *in, uint8_t *out) { // Loop through all of the llrs, and set the corresponding bit to 1 if LLR is // negative, otherwise to 0. We do not assume that the data_out pointer is -// initialised +// initialized template static inline void llrs_to_bits(uint32_t n, const T *llr, uint8_t *data_out) { uint32_t full_bytes = n >> 3; diff --git a/utils/cf32_utils.hpp b/utils/cf32_utils.hpp index 41f2c4f..f54c10e 100644 --- a/utils/cf32_utils.hpp +++ b/utils/cf32_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -43,15 +43,15 @@ class cf32_random { public: cf32_random(std::initializer_list seeds = {42}) - : m_state(random_state::from_seeds(seeds)) {} + : m_state(armral::utils::random_state::from_seeds(seeds)) {} static constexpr armral_cmplx_f32_t default_min{1.0F, 2.0F}; static constexpr armral_cmplx_f32_t default_max{2.0F, 4.0F}; armral_cmplx_f32_t one(armral_cmplx_f32_t min = default_min, armral_cmplx_f32_t max = default_max) { - return armral_cmplx_f32_t{rand(min.re, max.re), - rand(min.im, max.im)}; + return armral_cmplx_f32_t{rand(min.re, max.re), + rand(min.im, max.im)}; } std::vector vector(size_t len, @@ -65,11 +65,11 @@ public: } std::vector & - flip_signs(std::vector &vector, float chance_re = 0.5F, - float chance_im = 0.5F) { + flip_signs(std::vector &vector, + float32_t chance_re = 0.5F, float32_t chance_im = 0.5F) { for (auto &cmplx : vector) { - bool re_flip = rand(0, 1) < chance_re; - bool im_flip = rand(0, 1) < chance_im; + bool re_flip = rand(0, 1) < chance_re; + bool im_flip = rand(0, 1) < chance_im; cmplx.re = re_flip ? -cmplx.re : cmplx.re; cmplx.im = im_flip ? -cmplx.im : cmplx.im; } @@ -77,20 +77,20 @@ public: } std::vector - flip_signs(std::vector &&vector, float chance_re = 0.5F, - float chance_im = 0.5F) { + flip_signs(std::vector &&vector, + float32_t chance_re = 0.5F, float32_t chance_im = 0.5F) { auto result = std::move(vector); return flip_signs(result); } private: template - float rand(float min, float max) { - linear_congruential_generator lcg; + float32_t rand(float32_t min, float32_t max) { + armral::utils::linear_congruential_generator lcg; return lcg.one(&m_state, min, max); } - random_state m_state; + armral::utils::random_state m_state; }; static inline std::vector @@ -126,7 +126,7 @@ narrow_to_cf32(const std::vector> &a) { } static inline std::vector -pack_cf32(const std::vector &re, const std::vector &im) { +pack_cf32(const std::vector &re, const std::vector &im) { assert(re.size() == im.size()); std::vector ret(re.size()); for (unsigned i = 0; i < ret.size(); ++i) { @@ -135,18 +135,18 @@ pack_cf32(const std::vector &re, const std::vector &im) { return ret; } -static inline std::vector +static inline std::vector unpack_real_cf32(const std::vector &in) { - std::vector ret(in.size()); + std::vector ret(in.size()); for (unsigned i = 0; i < ret.size(); ++i) { ret[i] = in[i].re; } return ret; } -static inline std::vector +static inline std::vector unpack_imag_cf32(const std::vector &in) { - std::vector ret(in.size()); + std::vector ret(in.size()); for (unsigned i = 0; i < ret.size(); ++i) { ret[i] = in[i].im; } @@ -172,26 +172,27 @@ unpack_imag_cf32(const std::vector &in) { * * Returns true if the elements match elementwise, within tolerance. 
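// ---- Editor's note: illustrative sketch, not part of this patch ----
// The tolerance scheme in check_results_cf32 (the function this comment
// documents) combines a relative tolerance that scales with the operation
// count (op_count * machine epsilon) and a small absolute floor for values
// near zero. A minimal, self-contained version of that comparison could look
// like the following; the constants mirror the ones in the patch, but the
// pass/fail combination is an assumption, not the library's definitive
// error model.
#include <cmath>
#include <cstdint>
#include <limits>

static bool within_tolerance(const float *result, const float *expected,
                             uint32_t n, uint32_t op_count = 400) {
  const float rel_tol = op_count * std::numeric_limits<float>::epsilon();
  const float abs_tol = 0.000015F; // arbitrary floor, as in the patch
  for (uint32_t i = 0; i < n; ++i) {
    const float diff = std::fabs(result[i] - expected[i]);
    // Relative error when the reference is non-zero, absolute otherwise.
    const float err = expected[i] != 0.0F ? std::fabs(diff / expected[i])
                                          : std::fabs(result[i]);
    if (err > rel_tol && diff > abs_tol) {
      return false;
    }
  }
  return true;
}
// ---- End of editor's note ----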
*/ -static inline bool check_results_cf32(const char *name, const float *result, - const float *expected, uint32_t n, +static inline bool check_results_cf32(const char *name, const float32_t *result, + const float32_t *expected, uint32_t n, uint32_t op_count = 400) { bool passed = true; - float max_error = 0; - float diff_at_max_error = 0; - float max_diff = 0; - float error_at_max_diff = 0; + float32_t max_error = 0; + float32_t diff_at_max_error = 0; + float32_t max_diff = 0; + float32_t error_at_max_diff = 0; - float relative_tol = op_count * std::numeric_limits::epsilon(); + float32_t relative_tol = op_count * std::numeric_limits::epsilon(); // This is an arbitrarily chosen constant. // In the future, we would like to tighten the error bounds, which requires // problem-specific information, as well as restrictions on input values and // sizes. - float abs_tol = 0.000015; + float32_t abs_tol = 0.000015; for (uint32_t i = 0; i < n; ++i) { - float diff = fabs(result[i] - expected[i]); - float err = expected[i] != 0 ? fabs(diff / expected[i]) : fabs(result[i]); + float32_t diff = fabs(result[i] - expected[i]); + float32_t err = + expected[i] != 0 ? fabs(diff / expected[i]) : fabs(result[i]); if (err > max_error) { max_error = err; diff_at_max_error = diff; @@ -262,6 +263,6 @@ static inline bool check_results_cf32(const char *name, const armral_cmplx_f32_t *result, const armral_cmplx_f32_t *expected, uint32_t n, uint32_t op_count = 400) { - return check_results_cf32(name, (const float *)result, - (const float *)expected, n * 2, op_count); + return check_results_cf32(name, (const float32_t *)result, + (const float32_t *)expected, n * 2, op_count); } diff --git a/utils/cs16_utils.hpp b/utils/cs16_utils.hpp index 6824f3c..e4794d6 100644 --- a/utils/cs16_utils.hpp +++ b/utils/cs16_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/fft_utils.hpp b/utils/fft_utils.hpp index c99371c..c34e259 100644 --- a/utils/fft_utils.hpp +++ b/utils/fft_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/int8_utils.hpp b/utils/int8_utils.hpp index ec5d103..bc5bbdc 100644 --- a/utils/int8_utils.hpp +++ b/utils/int8_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/matrix_utils.hpp b/utils/matrix_utils.hpp index d15e31e..e3a5d0c 100644 --- a/utils/matrix_utils.hpp +++ b/utils/matrix_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -14,13 +14,13 @@ */ static inline std::vector allocate_random_cf32_lin_ind(uint32_t len) { - static linear_congruential_generator lcg; - auto state = random_state::from_seeds({42}); + static armral::utils::linear_congruential_generator lcg; + auto state = armral::utils::random_state::from_seeds({42}); std::vector ret(len); for (uint32_t i = 0; i < len; ++i) { - ret[i].re = lcg.one(&state, -100., 100.); - ret[i].im = lcg.one(&state, -100., 100.); + ret[i].re = lcg.one(&state, -100., 100.); + ret[i].im = 
lcg.one(&state, -100., 100.); } return ret; } @@ -29,8 +29,8 @@ allocate_random_cf32_lin_ind(uint32_t len) { * Generate random invertible matrices. */ static inline std::vector -gen_invertible_matrix(uint32_t m, float scale_re = 1.0F, - float scale_im = 1.0F) { +gen_invertible_matrix(uint32_t m, float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { auto a = allocate_random_cf32_lin_ind(m * m); @@ -38,15 +38,16 @@ gen_invertible_matrix(uint32_t m, float scale_re = 1.0F, // It is non-singular with high probability by virtue of sampling randomly. // If real-part is zeroed-out increase fac to avoid det(a)=0 - float fac = (scale_re == 0.0F) ? 2.0F : 1.0F; + float32_t fac = (scale_re == 0.0F) ? 2.0F : 1.0F; for (unsigned i = 0; i < m; ++i) { // force non-negative diagonal entries a[i * m + i].re = std::abs(a[i * m + i].re); for (unsigned j = 0; j < m; ++j) { if (i != j) { - a[i * m + i].re += fac * std::abs(std::complex(a[i * m + j].re, - a[i * m + j].im)); + a[i * m + i].re += + fac * + std::abs(std::complex(a[i * m + j].re, a[i * m + j].im)); } } } @@ -58,7 +59,8 @@ gen_invertible_matrix(uint32_t m, float scale_re = 1.0F, */ static inline std::vector gen_invertible_matrix_batch(uint32_t batch_size, uint32_t m, - float scale_re = 1.0F, float scale_im = 1.0F) { + float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F) { // Generate batch of matrices std::vector a(batch_size * m * m); @@ -77,8 +79,8 @@ gen_invertible_matrix_batch(uint32_t batch_size, uint32_t m, * definiteness) */ static inline std::vector -gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F, - float scale_im = 1.0F, bool perf = false) { +gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float32_t scale_re = 1.0F, + float32_t scale_im = 1.0F, bool perf = false) { auto a = perf ? std::vector(m * m) : allocate_random_cf32_lin_ind(m * m); @@ -122,16 +124,15 @@ gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F, // virtue of sampling randomly // If real-part is zeroed-out increase fac to avoid det(a)=0 - float fac = (scale_re == 0.0F) ? 2.0F : 1.0F; + float32_t fac = (scale_re == 0.0F) ? 2.0F : 1.0F; for (unsigned i = 0; i < m; ++i) { // force non-negative diagonal entries a[i * m + i].re = std::abs(a[i * m + i].re); for (unsigned j = 0; j < m; ++j) { if (i != j) { - a[i * m + i].re += - fac * - std::abs(std::complex(a[i * m + j].re, a[i * m + j].im)); + a[i * m + i].re += fac * std::abs(std::complex( + a[i * m + j].re, a[i * m + j].im)); } } } @@ -146,7 +147,7 @@ gen_hermitian_matrix(uint32_t m, bool is_hpd = false, float scale_re = 1.0F, */ static inline std::vector gen_hermitian_matrix_batch(uint32_t batch_size, uint32_t m, bool is_hpd = false, - float scale_re = 1.0F, float scale_im = 1.0F, + float32_t scale_re = 1.0F, float32_t scale_im = 1.0F, bool perf = false) { // Generate batch of matrices @@ -165,23 +166,23 @@ gen_hermitian_matrix_batch(uint32_t batch_size, uint32_t m, bool is_hpd = false, * Function to print check results of matrix inversion UTs. 
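// ---- Editor's note: illustrative sketch, not part of this patch ----
// gen_invertible_matrix and gen_hermitian_matrix above both rely on the same
// trick: after filling a matrix with random entries, add the absolute values
// of each row's off-diagonal entries onto the real part of the diagonal.
// The result is diagonally dominant, which for randomly sampled entries makes
// the matrix non-singular with overwhelming probability. A stand-alone version
// of that step, using std::complex<float> instead of armral_cmplx_f32_t, might
// look like this:
#include <cmath>
#include <complex>
#include <cstdint>
#include <vector>

static void make_diagonally_dominant(std::vector<std::complex<float>> &a,
                                     uint32_t m) {
  for (uint32_t i = 0; i < m; ++i) {
    float row_sum = 0.0F;
    for (uint32_t j = 0; j < m; ++j) {
      if (i != j) {
        row_sum += std::abs(a[i * m + j]); // modulus of off-diagonal entry
      }
    }
    // Force a non-negative real diagonal entry and push it above the row sum.
    a[i * m + i] = {std::fabs(a[i * m + i].real()) + row_sum,
                    a[i * m + i].imag()};
  }
}
// ---- End of editor's note ----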
*/ static bool check_results_mat_inv( - const std::string &name, const float *result, const float *expected, + const std::string &name, const float32_t *result, const float32_t *expected, const uint32_t n_values, /*n_values = 2 * nSamples, due to RE and IM part)*/ - const float rel_tol_mult = 1.0F, const float abs_tol_mult = 1.0F, + const float32_t rel_tol_mult = 1.0F, const float32_t abs_tol_mult = 1.0F, int verbose = 0) { bool passed = true; - float error = 0; - float max_error = 0; + float32_t error = 0; + float32_t max_error = 0; // TODO: arbitrarily chosen constant. we should probably do better than this, // but until we actually talk to people and get an idea of acceptable // tolerances then there's not much point in being too exact here. - float relative_tol = 0.00001; // 10^-5 - float diff_tolerance = 0.0001; // 10^-4 + float32_t relative_tol = 0.00001; // 10^-5 + float32_t diff_tolerance = 0.0001; // 10^-4 relative_tol *= rel_tol_mult; diff_tolerance *= abs_tol_mult; for (uint32_t i = 0; i < n_values; ++i) { - float diff_abs = fabs(result[i] - expected[i]); + float32_t diff_abs = fabs(result[i] - expected[i]); error = (expected[i] != 0) ? fabs(diff_abs / expected[i]) : fabs(result[i]); max_error = std::max(error, max_error); @@ -267,18 +268,18 @@ static inline bool check_results_identity(const armral_cmplx_f32_t *mat, std::vector> mm64(m * m); reference_zgemm(m, m, m, 1.0F, m64, inv_m64, 0.0F, mm64); convert_vector_to_cf32_array(m * m, mm64, mm.data()); - passed &= check_results_mat_inv("MM^{-1} - Id", (float *)mm.data(), - (float *)id.data(), 2 * m * m, (float)m, - (float)m, verbose); + passed &= check_results_mat_inv("MM^{-1} - Id", (float32_t *)mm.data(), + (float32_t *)id.data(), 2 * m * m, + (float32_t)m, (float32_t)m, verbose); } // MM^{-1} { std::vector> mm64(m * m); reference_zgemm(m, m, m, 1.0F, inv_m64, m64, 0.0F, mm64); convert_vector_to_cf32_array(m * m, mm64, mm.data()); - passed &= check_results_mat_inv("M^{-1}M - Id", (float *)mm.data(), - (float *)id.data(), 2 * m * m, (float)m, - (float)m, verbose); + passed &= check_results_mat_inv("M^{-1}M - Id", (float32_t *)mm.data(), + (float32_t *)id.data(), 2 * m * m, + (float32_t)m, (float32_t)m, verbose); } return passed; } diff --git a/utils/qint64.hpp b/utils/qint64.hpp index 8922edf..02ed5b0 100644 --- a/utils/qint64.hpp +++ b/utils/qint64.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/reference_linalg.hpp b/utils/reference_linalg.hpp index 0960d44..605b3db 100644 --- a/utils/reference_linalg.hpp +++ b/utils/reference_linalg.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -257,8 +257,8 @@ std::complex complex_convert(armral_cmplx_f32_t cmplx) { template armral_cmplx_f32_t complex_convert(std::complex cmplx) { - return armral_cmplx_f32_t{static_cast(cmplx.real()), - static_cast(cmplx.imag())}; + return armral_cmplx_f32_t{static_cast(cmplx.real()), + static_cast(cmplx.imag())}; } /* @@ -279,7 +279,7 @@ void convert_vector_to_cf32_array(uint16_t nvalues, const std::vector> &a, armral_cmplx_f32_t *b) { for (unsigned i = 0; i < nvalues; ++i) { - b[i] = {(float)a[i].real(), (float)a[i].imag()}; + b[i] = {(float32_t)a[i].real(), (float32_t)a[i].imag()}; } } diff --git a/utils/rng.cpp 
 b/utils/rng.cpp
index e14d97d..7904bc0 100644
--- a/utils/rng.cpp
+++ b/utils/rng.cpp
@@ -1,10 +1,14 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 #include "rng.hpp"
 
+#include
+
+namespace armral::utils {
+
 static inline uint64_t lcg_step(uint64_t x) {
   x = (x * 1103515245 + 12345) & 0x7fffffffU;
   return x;
 }
@@ -31,10 +35,10 @@ uint32_t linear_congruential_generator::one(random_state *state) {
 }
 
 template<>
-float linear_congruential_generator::one(random_state *state) {
+float32_t linear_congruential_generator::one(random_state *state) {
   auto x = lcg_step(state->seed);
   state->seed = x;
-  return (float)x / 0x80000000U;
+  return (float32_t)x / 0x80000000U;
 }
 
 template<>
@@ -64,3 +68,5 @@ random_state::from_seeds(const std::initializer_list seeds) {
   lcg.advance_state(&state, 3);
   return state;
 }
+
+} // namespace armral::utils
diff --git a/utils/rng.hpp b/utils/rng.hpp
index a6e09dc..fb129b0 100644
--- a/utils/rng.hpp
+++ b/utils/rng.hpp
@@ -1,6 +1,6 @@
 /*
   Arm RAN Acceleration Library
-  Copyright 2020-2024 Arm Limited and/or its affiliates
+  SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates
 */
 
 #pragma once
@@ -9,6 +9,8 @@
 #include
 #include
 
+namespace armral::utils {
+
 struct random_state;
 
 class linear_congruential_generator {
@@ -77,3 +79,5 @@ struct random_state {
    */
   static random_state from_seeds(std::initializer_list seeds);
 };
+
+} // namespace armral::utils
-- GitLab

From 30702f0d9ad711d02f159303f91247f4fc4b7d42 Mon Sep 17 00:00:00 2001
From: Nick Dingle
Date: Tue, 16 Apr 2024 09:20:56 +0000
Subject: [PATCH 2/2] Correct minor documentation errors

---
 README.md        | 2 +-
 docs/examples.md | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6d90bff..83a9a05 100644
--- a/README.md
+++ b/README.md
@@ -385,7 +385,7 @@ file.
 
 The Arm RAN Acceleration Library Reference Guide is available online at:
 
-
+  https://developer.arm.com/documentation/102249/2404
 
 If you have Doxygen installed on your system, you can build a local HTML
 version of the Arm RAN Acceleration Library documentation using CMake.
diff --git a/docs/examples.md b/docs/examples.md
index b53a8fb..c789c0d 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -148,15 +148,15 @@ included:
 
 The example binary takes three arguments, in the following order:
 
-  1. The polar code size (`N`)
-  2. The rate-matched codeword length (`E`)
-  3. The number of information bits (`K`)
+  1. The polar code size (`N`)
+  2. The rate-matched codeword length (`E`)
+  3. The number of information bits (`K`)
 
 For example, to run a compiled binary of the `polar_example.cpp`, called,
 `polar_example`, with an input array of `N = 128`, `E = 100`, and `K = 35`,
 use:
 
-    ./modulation_example 128 100 35
+    ./polar_example 128 100 35
 
 Each example can be run according to the **Procedure** described above, as
 demonstrated in the **Example: Run 'fft_cf32_example.c'** section.
-- GitLab
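
Editor's note: the utils/rng.cpp hunk above wraps the test-suite RNG in the
armral::utils namespace but leaves the generator itself unchanged. It is the
classic linear congruential recurrence (the constants used by many C-library
rand() implementations), masked to 31 bits, with floating-point samples
obtained by dividing by 2^31. The stand-alone sketch below shows that
recurrence in isolation; the function names and the main() driver are
illustrative only and are not part of the library's API.

    #include <cstdint>

    // One LCG step, as in utils/rng.cpp: 31-bit state, rand()-style constants.
    static inline uint64_t lcg_step(uint64_t x) {
      return (x * 1103515245 + 12345) & 0x7fffffffU;
    }

    // Map the 31-bit state to a float in [0, 1).
    static inline float lcg_to_float(uint64_t x) {
      return static_cast<float>(x) / 0x80000000U;
    }

    int main() {
      uint64_t state = 42;
      for (int i = 0; i < 4; ++i) {
        state = lcg_step(state);
        // Each sample lies in [0, 1); the sequence is deterministic per seed.
        (void)lcg_to_float(state);
      }
      return 0;
    }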