From 2f304504777627e6540137452e52716ed2d82fb9 Mon Sep 17 00:00:00 2001 From: Nick Dingle Date: Thu, 18 Jan 2024 12:28:39 +0000 Subject: [PATCH] Release 24.01 Co-Authored-By: Rosie Sumpter --- .clang-tidy | 6 +- CMakeLists.txt | 31 +- CONTRIBUTING.md | 4 +- CREDITS.md | 18 + Doxyfile.in | 2 +- README.md | 29 +- RELEASE_NOTES.md | 76 +-- bench/CRC/11/BigEndian/bench.py | 2 +- bench/CRC/11/BigEndian/main.cpp | 2 +- bench/CRC/11/LittleEndian/bench.py | 2 +- bench/CRC/11/LittleEndian/main.cpp | 2 +- bench/CRC/16/BigEndian/bench.py | 2 +- bench/CRC/16/BigEndian/main.cpp | 2 +- bench/CRC/16/LittleEndian/bench.py | 2 +- bench/CRC/16/LittleEndian/main.cpp | 2 +- bench/CRC/24/A/BigEndian/bench.py | 2 +- bench/CRC/24/A/BigEndian/main.cpp | 2 +- bench/CRC/24/A/LittleEndian/bench.py | 2 +- bench/CRC/24/A/LittleEndian/main.cpp | 2 +- bench/CRC/24/B/BigEndian/bench.py | 2 +- bench/CRC/24/B/BigEndian/main.cpp | 2 +- bench/CRC/24/B/LittleEndian/bench.py | 2 +- bench/CRC/24/B/LittleEndian/main.cpp | 2 +- bench/CRC/24/C/BigEndian/bench.py | 2 +- bench/CRC/24/C/BigEndian/main.cpp | 2 +- bench/CRC/24/C/LittleEndian/bench.py | 2 +- bench/CRC/24/C/LittleEndian/main.cpp | 2 +- bench/CRC/6/BigEndian/bench.py | 2 +- bench/CRC/6/BigEndian/main.cpp | 2 +- bench/CRC/6/LittleEndian/bench.py | 2 +- bench/CRC/6/LittleEndian/main.cpp | 2 +- bench/ConvCoding/Decoding/bench.py | 2 +- bench/ConvCoding/Decoding/main.cpp | 2 +- bench/ConvCoding/Encoding/bench.py | 2 +- bench/ConvCoding/Encoding/main.cpp | 2 +- bench/Correlation/bench.py | 2 +- bench/Correlation/main.cpp | 2 +- bench/Demodulation/bench.py | 2 +- bench/Demodulation/main.cpp | 2 +- bench/ElemWiseVectorMult/VecMul16/bench.py | 2 +- bench/ElemWiseVectorMult/VecMul16/main.cpp | 2 +- bench/ElemWiseVectorMult/VecMul16_2/bench.py | 2 +- bench/ElemWiseVectorMult/VecMul16_2/main.cpp | 2 +- bench/ElemWiseVectorMult/VecMul32/bench.py | 2 +- bench/ElemWiseVectorMult/VecMul32/main.cpp | 2 +- bench/ElemWiseVectorMult/VecMul32_2/bench.py | 2 +- bench/ElemWiseVectorMult/VecMul32_2/main.cpp | 2 +- bench/FFT/FFT16/bench.py | 2 +- bench/FFT/FFT16/main.cpp | 2 +- bench/FFT/FFT32/bench.py | 2 +- bench/FFT/FFT32/main.cpp | 2 +- bench/FIR/FIR16/bench.py | 2 +- bench/FIR/FIR16/main.cpp | 2 +- bench/FIR/FIR16Decimate2/bench.py | 2 +- bench/FIR/FIR16Decimate2/main.cpp | 2 +- bench/FIR/FIR32/bench.py | 2 +- bench/FIR/FIR32/main.cpp | 2 +- bench/FIR/FIR32Decimate2/bench.py | 2 +- bench/FIR/FIR32Decimate2/main.cpp | 2 +- bench/LDPC/Decoding/bench.py | 2 +- bench/LDPC/Decoding/main.cpp | 6 +- bench/LDPC/Encoding/bench.py | 5 +- bench/LDPC/Encoding/main.cpp | 4 +- bench/LDPC/RateMatching/bench.py | 45 ++ bench/LDPC/RateMatching/main.cpp | 79 +++ bench/LDPC/RateRecovery/bench.py | 46 ++ bench/LDPC/RateRecovery/main.cpp | 74 +++ .../Batch/GeneralMatInv/NonPA/bench.py | 2 +- .../Batch/GeneralMatInv/NonPA/main.cpp | 2 +- .../MatrixInv/Batch/GeneralMatInv/PA/bench.py | 2 +- .../MatrixInv/Batch/GeneralMatInv/PA/main.cpp | 2 +- .../Batch/HermitianMatInv/NonPA/bench.py | 2 +- .../Batch/HermitianMatInv/NonPA/main.cpp | 2 +- .../Batch/HermitianMatInv/PA/bench.py | 2 +- .../Batch/HermitianMatInv/PA/main.cpp | 2 +- bench/MatrixInv/Single/GeneralMatInv/bench.py | 2 +- bench/MatrixInv/Single/GeneralMatInv/main.cpp | 2 +- .../MatrixInv/Single/HermitianMatInv/bench.py | 2 +- .../MatrixInv/Single/HermitianMatInv/main.cpp | 2 +- bench/MatrixMult/Batch/ArmSolve/1x2/bench.py | 2 +- bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp | 2 +- bench/MatrixMult/Batch/ArmSolve/1x4/bench.py | 2 +- 
bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp | 2 +- bench/MatrixMult/Batch/ArmSolve/2x2/bench.py | 2 +- bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp | 2 +- bench/MatrixMult/Batch/ArmSolve/2x4/bench.py | 2 +- bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp | 2 +- bench/MatrixMult/Batch/ArmSolve/4x4/bench.py | 2 +- bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp | 2 +- .../MatrixVectorMult16/32b/NonPA/bench.py | 2 +- .../MatrixVectorMult16/32b/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult16/32b/PA/bench.py | 2 +- .../Batch/MatrixVectorMult16/32b/PA/main.cpp | 2 +- .../MatrixVectorMult16/64b/NonPA/bench.py | 2 +- .../MatrixVectorMult16/64b/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult16/64b/PA/bench.py | 2 +- .../Batch/MatrixVectorMult16/64b/PA/main.cpp | 2 +- .../Batch/MatrixVectorMult32/NonPA/bench.py | 2 +- .../Batch/MatrixVectorMult32/NonPA/main.cpp | 2 +- .../Batch/MatrixVectorMult32/PA/bench.py | 2 +- .../Batch/MatrixVectorMult32/PA/main.cpp | 2 +- .../Single/MatrixMult16/32b/bench.py | 2 +- .../Single/MatrixMult16/32b/main.cpp | 2 +- .../Single/MatrixMult16/64b/bench.py | 2 +- .../Single/MatrixMult16/64b/main.cpp | 2 +- .../Single/MatrixMult32/2x2/IQ/bench.py | 2 +- .../Single/MatrixMult32/2x2/IQ/main.cpp | 2 +- .../Single/MatrixMult32/2x2/NonIQ/bench.py | 2 +- .../Single/MatrixMult32/2x2/NonIQ/main.cpp | 2 +- .../Single/MatrixMult32/4x4/IQ/bench.py | 2 +- .../Single/MatrixMult32/4x4/IQ/main.cpp | 2 +- .../Single/MatrixMult32/4x4/NonIQ/bench.py | 2 +- .../Single/MatrixMult32/4x4/NonIQ/main.cpp | 2 +- .../Single/MatrixMult32/general/bench.py | 2 +- .../Single/MatrixMult32/general/main.cpp | 2 +- .../Single/MatrixMultAAH32/bench.py | 2 +- .../Single/MatrixMultAAH32/main.cpp | 2 +- .../Single/MatrixMultAHB32/bench.py | 2 +- .../Single/MatrixMultAHB32/main.cpp | 2 +- .../Single/MatrixVectorMult16/32bit/bench.py | 2 +- .../Single/MatrixVectorMult16/32bit/main.cpp | 2 +- .../Single/MatrixVectorMult16/64bit/bench.py | 2 +- .../Single/MatrixVectorMult16/64bit/main.cpp | 2 +- .../Single/MatrixVectorMult32/bench.py | 2 +- .../Single/MatrixVectorMult32/main.cpp | 2 +- bench/MatrixPseudoInv/Direct/bench.py | 8 +- bench/MatrixPseudoInv/Direct/main.cpp | 8 +- bench/Modulation/bench.py | 2 +- bench/Modulation/main.cpp | 2 +- bench/MuLaw/Compression/14bit/bench.py | 2 +- bench/MuLaw/Compression/14bit/main.cpp | 2 +- bench/MuLaw/Compression/8bit/bench.py | 2 +- bench/MuLaw/Compression/8bit/main.cpp | 2 +- bench/MuLaw/Compression/9bit/bench.py | 2 +- bench/MuLaw/Compression/9bit/main.cpp | 2 +- bench/MuLaw/Decompression/14bit/bench.py | 2 +- bench/MuLaw/Decompression/14bit/main.cpp | 2 +- bench/MuLaw/Decompression/8bit/bench.py | 2 +- bench/MuLaw/Decompression/8bit/main.cpp | 2 +- bench/MuLaw/Decompression/9bit/bench.py | 2 +- bench/MuLaw/Decompression/9bit/main.cpp | 2 +- .../Compression/14bit/bench.py | 2 +- .../Compression/14bit/main.cpp | 2 +- .../Compression/8bit/bench.py | 2 +- .../Compression/8bit/main.cpp | 2 +- .../Compression/9bit/bench.py | 2 +- .../Compression/9bit/main.cpp | 2 +- .../Decompression/14bit/bench.py | 2 +- .../Decompression/14bit/main.cpp | 2 +- .../Decompression/8bit/bench.py | 2 +- .../Decompression/8bit/main.cpp | 2 +- .../Decompression/9bit/bench.py | 2 +- .../Decompression/9bit/main.cpp | 2 +- bench/Polar/Decoding/bench.py | 2 +- bench/Polar/Decoding/main.cpp | 2 +- bench/Polar/Encoding/bench.py | 2 +- bench/Polar/Encoding/main.cpp | 2 +- bench/Polar/Frozen/bench.py | 2 +- bench/Polar/Frozen/main.cpp | 2 +- bench/Polar/RateMatching/bench.py | 2 +- 
bench/Polar/RateMatching/main.cpp | 2 +- bench/Polar/RateRecovery/bench.py | 2 +- bench/Polar/RateRecovery/main.cpp | 2 +- bench/Polar/SubchannelDeinterleave/bench.py | 2 +- bench/Polar/SubchannelDeinterleave/main.cpp | 2 +- bench/Polar/SubchannelInterleave/bench.py | 2 +- bench/Polar/SubchannelInterleave/main.cpp | 2 +- bench/SVD/bench.py | 2 +- bench/SVD/main.cpp | 2 +- bench/Scrambling/bench.py | 2 +- bench/Scrambling/main.cpp | 2 +- bench/SeqGenerator/bench.py | 2 +- bench/SeqGenerator/main.cpp | 2 +- bench/Turbo/Decoding/bench.py | 2 +- bench/Turbo/Decoding/main.cpp | 2 +- bench/Turbo/Encoding/bench.py | 2 +- bench/Turbo/Encoding/main.cpp | 2 +- bench/Turbo/RateMatching/bench.py | 2 +- bench/Turbo/RateMatching/main.cpp | 2 +- bench/Turbo/RateRecovery/bench.py | 2 +- bench/Turbo/RateRecovery/main.cpp | 2 +- bench/VectorDotProd/VecDot16/bench.py | 2 +- bench/VectorDotProd/VecDot16/main.cpp | 2 +- bench/VectorDotProd/VecDot16_2/bench.py | 2 +- bench/VectorDotProd/VecDot16_2/main.cpp | 2 +- bench/VectorDotProd/VecDot16_2_32bit/bench.py | 2 +- bench/VectorDotProd/VecDot16_2_32bit/main.cpp | 2 +- bench/VectorDotProd/VecDot16_32bit/bench.py | 2 +- bench/VectorDotProd/VecDot16_32bit/main.cpp | 2 +- bench/VectorDotProd/VecDot32/bench.py | 2 +- bench/VectorDotProd/VecDot32/main.cpp | 2 +- bench/VectorDotProd/VecDot32_2/bench.py | 2 +- bench/VectorDotProd/VecDot32_2/main.cpp | 2 +- .../XRanBlockFloat/Compression/12bit/bench.py | 2 +- .../XRanBlockFloat/Compression/12bit/main.cpp | 4 +- .../XRanBlockFloat/Compression/14bit/bench.py | 2 +- .../XRanBlockFloat/Compression/14bit/main.cpp | 4 +- .../XRanBlockFloat/Compression/8bit/bench.py | 2 +- .../XRanBlockFloat/Compression/8bit/main.cpp | 4 +- .../XRanBlockFloat/Compression/9bit/bench.py | 2 +- .../XRanBlockFloat/Compression/9bit/main.cpp | 4 +- .../Decompression/12bit/bench.py | 2 +- .../Decompression/12bit/main.cpp | 2 +- .../Decompression/14bit/bench.py | 2 +- .../Decompression/14bit/main.cpp | 2 +- .../Decompression/8bit/bench.py | 2 +- .../Decompression/8bit/main.cpp | 2 +- .../Decompression/9bit/bench.py | 2 +- .../Decompression/9bit/main.cpp | 2 +- bench/benchmarker.py | 2 +- bench/benchmarker_utils.py | 2 +- bench/default_runner.py | 2 +- docs/doxywrapper/arm_footer.html | 4 +- docs/doxywrapper/proprietary_notice.html | 2 +- docs/examples.md | 6 +- docs/frontmatter.md | 7 +- examples/block_float_9b_example.c | 2 +- examples/fft_cf32_example.c | 2 +- examples/modulation_example.c | 2 +- examples/polar_example.cpp | 2 +- include/.clang-tidy | 1 - include/armral.h | 99 ++- license_terms/BSD-3-Clause.txt | 2 +- simulation/README.md | 2 +- simulation/awgn/awgn.cpp | 2 +- simulation/awgn/awgn.h | 2 +- simulation/capacity/capacity.py | 2 +- simulation/convolutional_awgn/CMakeLists.txt | 1 + .../convolutional_awgn/convolutional_awgn.cpp | 2 +- .../convolutional_error_rate.py | 2 +- simulation/include/simulation_common.hpp | 2 +- simulation/include/simulation_common.py | 2 +- simulation/ldpc_awgn/CMakeLists.txt | 1 + simulation/ldpc_awgn/ldpc_awgn.cpp | 10 +- simulation/ldpc_awgn/ldpc_error_rate.py | 2 +- simulation/modulation_awgn/CMakeLists.txt | 1 + .../modulation_awgn/modulation_awgn.cpp | 2 +- .../modulation_awgn/modulation_error_rate.py | 2 +- simulation/polar_awgn/CMakeLists.txt | 1 + simulation/polar_awgn/polar_awgn.cpp | 3 +- simulation/polar_awgn/polar_error_rate.py | 2 +- simulation/turbo_awgn/CMakeLists.txt | 1 + simulation/turbo_awgn/turbo_awgn.cpp | 2 +- simulation/turbo_awgn/turbo_error_rate.py | 2 +- 
.../arm_cmplx_hermitian_mat_inversion_f32.cpp | 2 +- .../MatrixInv/arm_cmplx_mat_inversion_f32.cpp | 2 +- .../cmplx_hermitian_mat_inversion_f32.hpp | 2 +- .../MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp | 2 +- .../MatrixMult/arm_cmplx_mat_mult_ahb_f32.c | 2 +- .../MatrixMult/arm_cmplx_mat_mult_f32.c | 2 +- .../MatrixMult/arm_cmplx_mat_mult_i16.c | 2 +- .../MatrixMult/arm_cmplx_mat_mult_i16_32bit.c | 2 +- .../MatrixMult/arm_cmplx_mat_vec_mult_f32.c | 2 +- .../MatrixMult/arm_cmplx_mat_vec_mult_i16.c | 2 +- .../arm_cmplx_mat_vec_mult_i16_32bit.c | 7 +- src/BasicMathFun/MatrixMult/arm_solve_1sc.c | 2 +- src/BasicMathFun/MatrixMult/arm_solve_1sc.h | 2 +- src/BasicMathFun/MatrixMult/arm_solve_4sc.c | 2 +- src/BasicMathFun/MatrixMult/arm_solve_4sc.h | 2 +- src/BasicMathFun/MatrixMult/arm_solve_6sc.c | 2 +- src/BasicMathFun/MatrixMult/arm_solve_6sc.h | 2 +- .../MatrixMult/arm_solve_convert.h | 2 +- src/BasicMathFun/MatrixMult/arm_solve_f32.c | 2 +- .../arm_cmplx_pseudo_inverse_direct_f32.cpp | 88 ++- .../cmplx_mat_pseudo_inverse.hpp | 69 ++ .../VectorDotProd/arm_cmplx_vecdot_f32.c | 2 +- .../VectorDotProd/arm_cmplx_vecdot_f32_2.c | 10 +- .../VectorDotProd/arm_cmplx_vecdot_i16.c | 2 +- .../VectorDotProd/arm_cmplx_vecdot_i16_2.c | 2 +- .../arm_cmplx_vecdot_i16_2_32bit.c | 2 +- .../arm_cmplx_vecdot_i16_32bit.c | 2 +- .../VectorMult/arm_cmplx_vecmul_f32.c | 2 +- .../VectorMult/arm_cmplx_vecmul_f32_2.c | 2 +- .../VectorMult/arm_cmplx_vecmul_i16.cpp | 2 +- .../VectorMult/arm_cmplx_vecmul_i16_2.c | 2 +- .../arm_mu_law_compression.cpp | 2 +- .../arm_mu_law_decompression.cpp | 2 +- .../arm_block_float_compression.cpp | 2 +- .../arm_block_float_decompression.cpp | 2 +- .../arm_block_scaling_compression.cpp | 2 +- .../arm_block_scaling_decompression.cpp | 2 +- src/DuRuInterface/bit_packing_common.hpp | 2 +- src/LowerPHY/Correlation/arm_correlation.c | 2 +- src/LowerPHY/FFT/fft_cf32.cpp | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c | 38 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c | 38 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c | 38 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c | 62 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c | 62 +- src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c | 38 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h | 2 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c | 62 +- src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cf32_kernel_lookup.c | 2 +- src/LowerPHY/FFT/fft_cf32_kernel_lookup.h | 2 +- src/LowerPHY/FFT/fft_cs16.cpp | 2 +- src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c | 62 +- src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c | 62 +- src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h | 2 +- src/LowerPHY/FFT/fft_cs16_kernel_lookup.c | 2 +- src/LowerPHY/FFT/fft_cs16_kernel_lookup.h | 2 +- src/LowerPHY/FFT/fft_execute.cpp | 2 +- src/LowerPHY/FFT/fft_execute.hpp | 2 +- src/LowerPHY/FFT/fft_helper.h | 2 +- src/LowerPHY/FFT/fft_level.cpp | 2 +- src/LowerPHY/FFT/fft_level.hpp | 2 +- src/LowerPHY/FFT/fft_plan.cpp | 2 +- src/LowerPHY/FFT/fft_plan.hpp | 2 +- src/LowerPHY/FFT/fft_types.hpp | 2 +- src/LowerPHY/FFT/rader.cpp | 2 +- src/LowerPHY/FFT/rader.hpp | 2 +- src/LowerPHY/FFT/rader_generator.cpp | 2 +- src/LowerPHY/FFT/rader_generator.hpp 
| 2 +- src/LowerPHY/FIR/arm_fir_filter_cf32.c | 4 +- .../FIR/arm_fir_filter_cf32_decimate_2.c | 6 +- src/LowerPHY/FIR/arm_fir_filter_cs16.c | 11 +- .../FIR/arm_fir_filter_cs16_decimate_2.c | 2 +- src/LowerPHY/Scrambling/arm_scrambling.cpp | 2 +- .../SeqGenerator/arm_mat_seq_generator.cpp | 18 +- src/SVD/arm_svd.cpp | 4 +- src/SVD/matrix_view.hpp | 6 +- src/UpperPHY/CRC/arm_crc11.cpp | 4 +- src/UpperPHY/CRC/arm_crc16.cpp | 4 +- src/UpperPHY/CRC/arm_crc24_a.cpp | 4 +- src/UpperPHY/CRC/arm_crc24_b.cpp | 4 +- src/UpperPHY/CRC/arm_crc24_c.cpp | 4 +- src/UpperPHY/CRC/arm_crc6.cpp | 4 +- src/UpperPHY/CRC/crc_basic.hpp | 2 +- src/UpperPHY/CRC/crc_common.hpp | 13 +- .../arm_convolutional_decoder.cpp | 19 +- .../arm_convolutional_encoder.cpp | 2 +- .../convolutional_code_table.hpp | 2 +- src/UpperPHY/Demodulation/arm_demodulation.c | 2 +- src/UpperPHY/LDPC/ldpc_coding.hpp | 7 +- src/UpperPHY/LDPC/ldpc_decoder.cpp | 92 +-- src/UpperPHY/LDPC/ldpc_encoder.cpp | 46 +- src/UpperPHY/LDPC/ldpc_rate_common.hpp | 30 + src/UpperPHY/LDPC/ldpc_rate_matching.cpp | 42 +- src/UpperPHY/LDPC/ldpc_rate_recovery.cpp | 28 +- src/UpperPHY/Modulation/arm_modulation.c | 2 +- .../Polar/arm_polar_crc_attachment.cpp | 2 +- src/UpperPHY/Polar/arm_polar_crc_check.cpp | 2 +- src/UpperPHY/Polar/arm_polar_decoder.cpp | 611 ++---------------- src/UpperPHY/Polar/arm_polar_decoder.hpp | 21 + src/UpperPHY/Polar/arm_polar_decoder_neon.hpp | 507 +++++++++++++++ src/UpperPHY/Polar/arm_polar_encoder.c | 2 +- src/UpperPHY/Polar/arm_polar_frozen_bits.cpp | 2 +- .../Polar/arm_polar_rate_matching.cpp | 2 +- .../Polar/arm_polar_rate_recovery.cpp | 2 +- .../arm_polar_subchannel_deinterleave.cpp | 2 +- .../Polar/arm_polar_subchannel_interleave.cpp | 2 +- src/UpperPHY/Turbo/arm_turbo_decoder.cpp | 135 ++-- src/UpperPHY/Turbo/arm_turbo_encoder.cpp | 16 +- .../Turbo/arm_turbo_rate_matching.cpp | 35 +- .../Turbo/arm_turbo_rate_recovery.cpp | 51 +- src/UpperPHY/Turbo/turbo_code.hpp | 2 +- src/UpperPHY/Turbo/turbo_tables.hpp | 6 +- src/intrinsics.h | 29 +- src/utils/allocators.hpp | 2 +- src/utils/cmplx_arith_f32.hpp | 2 +- src/utils/vec_mul.hpp | 2 +- test/CRC/main.cpp | 2 +- test/ConvCoding/decoding/main.cpp | 4 +- test/ConvCoding/encoding/main.cpp | 4 +- test/Correlation/main.cpp | 2 +- test/Demodulation/main.cpp | 2 +- test/ElemWiseVectorMult/vecMul16/main.cpp | 4 +- test/ElemWiseVectorMult/vecMul16_2/main.cpp | 4 +- test/ElemWiseVectorMult/vecMul32/main.cpp | 4 +- test/ElemWiseVectorMult/vecMul32_2/main.cpp | 4 +- test/FFT/cf32/main.cpp | 2 +- test/FFT/cs16/main.cpp | 2 +- test/FIR/arm_fir_filter_cf32/main.cpp | 4 +- .../arm_fir_filter_cf32_decimate_2/main.cpp | 4 +- test/FIR/arm_fir_filter_cs16/main.cpp | 2 +- .../arm_fir_filter_cs16_decimate_2/main.cpp | 4 +- test/LDPC/decoding/main.cpp | 2 +- test/LDPC/encoding/ldpc_encoding_test_data.h | 2 +- test/LDPC/encoding/main.cpp | 9 +- test/LDPC/ldpc_test_common.hpp | 4 +- test/LDPC/rate_matching/main.cpp | 17 +- test/LDPC/rate_recovery/main.cpp | 10 +- test/MatrixInv/batch/main.cpp | 12 +- test/MatrixInv/single/main.cpp | 2 +- test/MatrixMult/batch/ArmSolve/main.cpp | 2 +- .../batch/MatrixVectorMult16/main.cpp | 2 +- .../batch/MatrixVectorMult32/main.cpp | 2 +- test/MatrixMult/single/MatrixMult16/main.cpp | 2 +- test/MatrixMult/single/MatrixMult32/main.cpp | 2 +- .../single/MatrixMultAAH32/main.cpp | 3 +- .../single/MatrixMultAHB32/main.cpp | 3 +- .../single/MatrixVectorMult16/main.cpp | 2 +- .../single/MatrixVectorMult32/main.cpp | 2 +- test/MatrixPseudoInv/direct/main.cpp | 88 ++- 
test/Modulation/main.cpp | 2 +- test/MuLaw/Compression/main.cpp | 2 +- test/MuLaw/Decompression/main.cpp | 2 +- test/ORanBlockScaling/Compression/main.cpp | 2 +- test/ORanBlockScaling/Decompression/main.cpp | 2 +- test/Polar/crc_attachment/main.cpp | 2 +- .../crc_attachment/polar_crc_attach_data.hpp | 2 +- test/Polar/decoding/main.cpp | 33 +- test/Polar/encoding/main.cpp | 2 +- test/Polar/frozen/main.cpp | 2 +- test/Polar/rate_matching/main.cpp | 2 +- test/Polar/rate_recovery/main.cpp | 6 +- test/Polar/subchannel_deinterleave/main.cpp | 2 +- test/Polar/subchannel_interleave/main.cpp | 2 +- test/SVD/main.cpp | 4 +- test/SVD/svd_sample_data.h | 2 +- test/SVD/svd_test.hpp | 32 +- test/Scrambling/main.cpp | 2 +- test/SeqGenerator/main.cpp | 2 +- test/Turbo/decoding/main.cpp | 3 +- test/Turbo/encoding/main.cpp | 2 +- .../encoding/reference_turbo_encoder.hpp | 2 +- test/Turbo/rate_matching/main.cpp | 2 +- test/Turbo/rate_recovery/main.cpp | 2 +- .../rate_recovery/rate_recovery_data.hpp | 2 +- test/Turbo/turbo_test_data.hpp | 2 +- test/VectorDotProd/vecDot16/main.cpp | 2 +- test/VectorDotProd/vecDot16_2/main.cpp | 4 +- test/VectorDotProd/vecDot16_2_32bit/main.cpp | 4 +- test/VectorDotProd/vecDot16_32bit/main.cpp | 2 +- test/VectorDotProd/vecDot32/main.cpp | 4 +- test/VectorDotProd/vecDot32_2/main.cpp | 4 +- test/XRanBlockFloat/Compression/main.cpp | 2 +- test/XRanBlockFloat/Decompression/main.cpp | 2 +- utils/bit_utils.hpp | 2 +- utils/cf32_utils.hpp | 2 +- utils/cs16_utils.hpp | 2 +- utils/fft_utils.hpp | 2 +- utils/int8_utils.hpp | 2 +- utils/matrix_utils.hpp | 282 +------- utils/qint64.hpp | 2 +- utils/reference_linalg.hpp | 296 ++++++++- utils/rng.cpp | 2 +- utils/rng.hpp | 2 +- 444 files changed, 2550 insertions(+), 1977 deletions(-) create mode 100755 bench/LDPC/RateMatching/bench.py create mode 100644 bench/LDPC/RateMatching/main.cpp create mode 100755 bench/LDPC/RateRecovery/bench.py create mode 100644 bench/LDPC/RateRecovery/main.cpp create mode 100644 src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp create mode 100644 src/UpperPHY/LDPC/ldpc_rate_common.hpp create mode 100644 src/UpperPHY/Polar/arm_polar_decoder.hpp create mode 100644 src/UpperPHY/Polar/arm_polar_decoder_neon.hpp diff --git a/.clang-tidy b/.clang-tidy index 90197ae..e5ada05 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,9 +1,8 @@ --- Checks: '-*,readability*,-readability-magic-numbers,-readability-function-size,-readability-function-cognitive-complexity,-readability-identifier-length' -WarningsAsErrors: 'readability*' +WarningsAsErrors: '*' HeaderFilterRegex: '*.h,*.hpp' -AnalyzeTemporaryDtors: false -FormatStyle: file +FormatStyle: 'file' CheckOptions: - { key: readability-identifier-naming.ClassCase, value: lower_case } - { key: readability-identifier-naming.StructCase, value: lower_case } @@ -62,4 +61,3 @@ CheckOptions: - key: google-readability-function-size.StatementThreshold value: '800' ... 
- diff --git a/CMakeLists.txt b/CMakeLists.txt index 591f724..27da4d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.3) -project(armral VERSION 23.10) +project(armral VERSION 24.01) if(CMAKE_VERSION VERSION_GREATER 3.4) # stop CMake from automatically adding -rdynamic to linker flags @@ -16,6 +16,7 @@ endif() option(ARMRAL_ENABLE_WERROR "Enable -Werror when building the library and tests" OFF) option(ARMRAL_ENABLE_ASAN "Enable AddressSanitizer when building the library and tests" OFF) +option(ARMRAL_ENABLE_EFENCE "Enable Electric Fence when building the library and tests" OFF) option(ARMRAL_ENABLE_COVERAGE "Enable instrumentation for generating code coverage" OFF) option(BUILD_SIMULATION "Enable building channel simulation programs" ON) set(ARMRAL_ARCH NEON CACHE STRING "The architecture to build for ('NEON' or 'SVE2')") @@ -120,6 +121,15 @@ if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) # Note: We don't universally enable this flag, as in some cases it can cause regressions. set_property(SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c APPEND PROPERTY COMPILE_OPTIONS $<$,$>:-frename-registers>) + + if(ARMRAL_ENABLE_WERROR) + # Disable warnings-as-errors about C-style Variable Length Arrays in FFT source when using Clang++ + set_property(SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + APPEND PROPERTY COMPILE_OPTIONS $<$:-Wno-error=vla-extension>) + endif() endif() set(ARMRAL_UTIL_SOURCES @@ -160,7 +170,7 @@ if(ARMRAL_ENABLE_WERROR) message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_WERROR") else() set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -Werror) -endif() + endif() endif() if(ARMRAL_ENABLE_ASAN) @@ -172,12 +182,21 @@ if(ARMRAL_ENABLE_ASAN) endif() endif() +if(ARMRAL_ENABLE_EFENCE) + if(ARMRAL_OVERRIDE_COMPILE_FLAGS) + message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. Ignoring option ARMRAL_ENABLE_EFENCE") + else() + set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} -lefence) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -lefence) + endif() +endif() + if(ARMRAL_ENABLE_COVERAGE) if(ARMRAL_OVERRIDE_COMPILE_FLAGS) message(WARNING "CMAKE_C_FLAGS and CMAKE_CXX_FLAGS manually specified. 
Ignoring option ARMRAL_ENABLE_COVERAGE") else() - set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage) - set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage) + set(ARMRAL_COMPILER_FLAGS ${ARMRAL_COMPILER_FLAGS} --coverage -fprofile-update=atomic) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} --coverage -fprofile-update=atomic) endif() endif() @@ -440,6 +459,8 @@ endfunction() add_armral_bench(arm_fir_filter_cf32_decimate_2 bench/FIR/FIR32Decimate2/main.cpp) add_armral_bench(ldpc_decoding bench/LDPC/Decoding/main.cpp) add_armral_bench(ldpc_encoding bench/LDPC/Encoding/main.cpp) + add_armral_bench(ldpc_rate_matching bench/LDPC/RateMatching/main.cpp) + add_armral_bench(ldpc_rate_recovery bench/LDPC/RateRecovery/main.cpp) add_armral_bench(matrix_inv_single_general bench/MatrixInv/Single/GeneralMatInv/main.cpp) add_armral_bench(matrix_inv_single_hermitian bench/MatrixInv/Single/HermitianMatInv/main.cpp) add_armral_bench(matrix_inv_batch_general bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) @@ -583,7 +604,7 @@ set (COMP_ERR_MSG "Compilation is only supported with GNU versions 7, 8, 9, 10, 11, 12, 13, or Clang versions greater than or equal to 12.0.1. \ If compilation fails please use one of the supported compilers.") if (CMAKE_C_COMPILER_ID STREQUAL "GNU") - if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION VERSION_GREATER 13.1) + if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION VERSION_GREATER 13.2) message(WARNING ${COMP_ERR_MSG}) endif() elseif (CMAKE_C_COMPILER_ID STREQUAL "Clang") diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a730ee5..24d142a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -416,7 +416,7 @@ The following code block provides a template for the `bench.py` script. ```py #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path @@ -482,7 +482,7 @@ The following code block provides a basic template. ```cpp /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/CREDITS.md b/CREDITS.md index 8605fae..467a1b1 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -2,6 +2,23 @@ In addition to the primary development being done by Arm, the following people and organizations have contributed to Arm RAN Acceleration Library: +- Work on `armral_ldpc_rate_recovery` to correctly set the + log-likelihood ratios of filler bits was contributed upstream by + 4g5g Consultants. See + https://gitlab.arm.com/networking/ral/-/merge_requests/6. + +- Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` + to support the addition and removal of filler bits when the soft + buffer size is less than the full buffer size was contributed + upstream by 4g5g Consultants. See + https://gitlab.arm.com/networking/ral/-/merge_requests/5. + +- Work on `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and + `armral_ldpc_rate_recovery` to support the addition and removal of + filler bits when the code block size is not a multiple of lifting + set size was contributed upstream by 4g5g Consultants. See + https://gitlab.arm.com/networking/ral/-/merge_requests/4 + - Work on `armral_seq_generator` to extend the `sequence_len` parameter to `uint32_t` was contributed upstream by 4g5g Consultants. See @@ -17,3 +34,4 @@ Acceleration Library: Consultants. 
  See https://gitlab.arm.com/networking/ral/-/merge_requests/1
+
diff --git a/Doxyfile.in b/Doxyfile.in
index 6d6ad1a..c470dc9 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER = "23.10"
+PROJECT_NUMBER = "24.01"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/README.md b/README.md
index f8d16ad..bcaea5e 100644
--- a/README.md
+++ b/README.md
@@ -60,9 +60,10 @@ to download the source code.
   but are required if you want to run the library tests (`-DBUILD_TESTING`)
   and benchmarks (`-DBUILD_EXAMPLES`).
 
-  * The `-DCMAKE_INSTALL_DIR=` option is optional and sets the
-    install location (``) for the library. The default location is
-    `/usr/local`.
+  * The `-DCMAKE_INSTALL_PREFIX=` option is optional and
+    specifies the base directory used to install the library. The library
+    archive is installed to `/lib` and headers are installed to
+    `/include`. The default location is `/usr/local`.
 
   * By default, a static library is built. To build a dynamic or a static
     library use the `-DBUILD_SHARED_LIBS={On|Off}` option.
@@ -73,14 +74,6 @@ to download the source code.
 
   Other common CMake `{options}` include:
 
-  * `-DCMAKE_INSTALL_PREFIX=`
-
-    Specifies the base directory used to install the library. The library
-    archive is installed to `/lib` and headers are installed to
-    `/include`.
-
-    Default `` is `/usr/local`.
-
   * `-DCMAKE_BUILD_TYPE={Debug|Release}`
 
     Specifies the set of flags used to build the library. The default is
@@ -164,11 +157,21 @@ to download the source code.
     Enable AddressSanitizer when building the library and tests.
     AddressSanitizer adds extra runtime checks to enable you to catch
-    errors, such as reads or writes off the end of arrays.
+    memory errors, such as reading or writing past the end of an array.
     `-DARMRAL_ENABLE_ASAN=On` incurs some reduction in runtime performance.
 
     Default is `Off`.
 
+  * `-DARMRAL_ENABLE_EFENCE={On|Off}`
+
+    Enable Electric Fence when building the library and tests.
+    Electric Fence will cause tests to segmentation fault in the presence
+    of memory errors, such as reading or writing past the end of an array.
+    This option allows you to test executables running under a test runner
+    such as QEMU.
+
+    Default is `Off`.
+
   * `-DARMRAL_ENABLE_COVERAGE={On|Off}`
 
     Enable (`On`), or disable (`Off`), code coverage instrumentation when
@@ -362,7 +365,7 @@ file.
 
 The Arm RAN Acceleration Library Reference Guide is available online at:
 
-  https://developer.arm.com/documentation/102249/2310
+  https://developer.arm.com/documentation/102249/2401
 
 If you have Doxygen installed on your system, you can build a local HTML
 version of the Arm RAN Acceleration Library documentation using CMake.
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 9e04ed3..eef47a1 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,7 +1,7 @@
-# Arm RAN Acceleration Library 23.10 Release Note
+# Arm RAN Acceleration Library 24.01 Release Note
 
 Non-Confidential
 
-Copyright © 2020-2023 Arm Limited (or its affiliates). All rights reserved.
+Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved.
 Arm conventions and proprietary notices, including confidentiality status,
 terminology statement, and product release status, can be found at the end of
@@ -31,7 +31,7 @@ The Arm RAN Acceleration Library (ArmRAL) contains a set of functions for
 accelerating telecommunications applications such as, but not limited to, 5G
 Radio Access Networks (RANs).
 
-The Arm RAN Acceleration Library 23.10 package provides a library that is
+The Arm RAN Acceleration Library 24.01 package provides a library that is
 optimized for Arm AArch64-based processors. Arm RAN Acceleration Library
 provides:
@@ -47,7 +47,7 @@ integers and 32-bit floating-point values.
 
 ## Release Status
 
-This is the 23.10 release of Arm RAN Acceleration Library.
+This is the 24.01 release of Arm RAN Acceleration Library.
 
 These deliverables are being released under the terms of the agreement between
 Arm and each licensee (the "Agreement"). All planned verification and
@@ -101,19 +101,19 @@ source from the Arm Developer website and then unpack the contents.
 
 5. Extract the tar file contents using a tar utility:
 
-       tar zxvf ral-armral-23.10.tar.gz
+       tar zxvf ral-armral-24.01.tar.gz
 
 ## Deliverables
 
 The downloaded product includes the deliverables listed in this section.
 
-- Arm RAN Acceleration Library 23.10
+- Arm RAN Acceleration Library 24.01
 - Release Notes (this document)
 - Documentation
 
   Product documentation is available on the Arm Developer website at:
 
-  https://developer.arm.com/documentation/102249/2310
+  https://developer.arm.com/documentation/102249/2401
 
   **Note:** Documentation, errata and release notes might change between
   product releases. For the latest documentation bundle, check the product download
@@ -134,58 +134,58 @@ Arm RAN Acceleration Library.
 
 Describes new features or any technical changes to features or components in
 this release.
 
-- Extended the `sequence_len` parameter of `armral_seq_generator` to
-  `uint32_t`. This work was contributed upstream by 4g5g Consultants.
+- Added support for the addition and removal of filler bits in
+  `armral_ldpc_encode_block`, `armral_ldpc_rate_matching` and
+  `armral_ldpc_rate_recovery` when the code block size is not a
+  multiple of lifting set size or when the soft buffer size is less
+  than the full buffer size. This process is described in the 3GPP
+  Technical Specification (TS) 38.212. This work was contributed
+  upstream by 4g5g Consultants.
 
-- Added parameter `i_bil` to `armral_polar_rate_matching` and
-  `armral_polar_rate_recovery` to enable or disable bit
-  interleaving. This work was contributed upstream by 4g5g
-  Consultants.
-
-- Added parameter `nref` to `armral_ldpc_rate_matching` and
-  `armral_ldpc_rate_recovery` to enable the routines to be used with a
-  soft buffer size. This work was contributed upstream by 4g5g
-  Consultants.
-
-- Added parameter `nref` to `armral_ldpc_rate_matching` and
-  `armral_ldpc_rate_recovery` to enable the routines to be used with a
-  soft buffer size. This work was contributed by Suraj Chalapathy from
-  4g5g Consultants.
+- Extended `armral_cmplx_pseudo_inverse_direct_f32` and
+  `armral_cmplx_pseudo_inverse_direct_f32_noalloc` to compute the
+  regularized pseudo-inverse of a single complex 32-bit matrix of size
+  `M-by-N` for cases where `M > N` in addition to the cases where `M
+  <= N`.
 
 ### Performance improvements
 
 Describes any features or components whose performance has improved in the
 current release compared with the previous release.
 
-- Performance improvements for Neon implementations of the following routines:
+- Performance improvements for the following routines:
 
-  * Polar block decoding (`armral_polar_decode_block`) for list
-    lengths 1, 2, 4 and 8.
+  * `armral_turbo_decode_block` and `armral_turbo_decode_block_noalloc`.
 
-  * LDPC block decoding (`armral_ldpc_decode_block` and
-    `armral_ldpc_decode_block_noalloc`).
+- Performance improvements for SVE2 implementations of the following routines:
+
+  * `armral_seq_generator`, for the cases when `sequence_len` is not a
+    multiple of 64.
 
 ### Changes to simulation programs
 
 Describes any changes, new features or components added to the channel
 simulation programs in this release.
 
-- Simulation programs are now built by default and are tested by the
-  make check target.
+- Added support for the addition and removal of filler bits in
+  `ldpc_awgn` when the code block size is not a multiple of lifting
+  set size. This work was contributed upstream by 4g5g Consultants.
 
 ### Resolved issues
 
-There are no resolved issues in this release.
+Describes any known issues resolved in the current release.
+
+- LDPC block encoding (`armral_ldpc_encode_block`), rate matching
+  (`armral_ldpc_rate_matching`) and rate recovery
+  (`armral_ldpc_rate_recovery`) now support the insertion and removal
+  of filler bits as described in the 3GPP Technical Specification (TS)
+  38.212.
 
 ## Known limitations
 
 Describes any known limitations of the current release.
 
-- LDPC block encoding (`armral_ldpc_encode_block`), rate matching
-  (`armral_ldpc_rate_matching`) and rate recovery
-  (`armral_ldpc_rate_recovery`) do not support the insertion and
-  removal of filler bits as described in as described in the 3GPP
-  Technical Specification (TS) 38.212.
+- There are no known limitations in this release.
 
 # Support
 
@@ -210,7 +210,7 @@ Acceleration Library:
 
 * A recent version of a C/C++ compiler, such as GCC. Arm RAN Acceleration
   Library has been tested with GCC 7.5.0, 8.5.0, 9.5.0,
-  10.4.0, 11.4.0, 12.3.0, and 13.1.0.
+  10.5.0, 11.4.0, 12.3.0, and 13.2.0.
 
 **Note:** If you are cross-compiling, you need a cross-toolchain compiler that
 targets AArch64. You can download open-source cross-toolchain builds of the
@@ -312,7 +312,7 @@ rights reserved.
 Other brands and names mentioned in this document may be the trademarks of
 their respective owners. Please follow Arm’s trademark usage guidelines at
 https://www.arm.com/company/policies/trademarks.
 
-Copyright © 2020-2023 Arm Limited (or its affiliates). All rights reserved.
+Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved.
 
 Arm Limited. Company 02557590 registered in England. 110 Fulbourn Road,
 Cambridge, England CB1 9NJ.
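The filler-bit support described in the release notes above is exercised through `armral_ldpc_rate_matching`. The sketch below is illustrative only: it mirrors the benchmark driver added in `bench/LDPC/RateMatching/main.cpp` later in this patch (same argument order and buffer sizing), and its `z`, `e`, `nref` and filler-bit values are taken from the benchmark parameter tables in this patch. The input buffer is zero-filled rather than a real encoded code block, and the numeric modulation value 0 is passed through exactly as the benchmark does; its mapping to a modulation scheme is an assumption.

```cpp
// Minimal sketch, assuming the parameter order used by the benchmark driver
// added in this patch; not a drop-in replacement for a real PDSCH pipeline.
#include "armral.h"

#include <cstdint>
#include <vector>

int main() {
  auto bg = LDPC_BASE_GRAPH_1;    // base graph 1
  uint32_t z = 112;               // lifting size (from the benchmark tables)
  uint32_t e = 2464;              // number of rate-matched bits
  uint32_t nref = 25344;          // soft buffer size for limited-buffer rate matching
  uint32_t len_filler_bits = 72;  // filler bits paired with z = 112 in the encoding benchmark
  uint32_t rv = 0;                // redundancy version, must be in {0, 1, 2, 3}
  // Raw value forwarded as in the benchmark; the enum value it selects is assumed.
  auto mod = static_cast<armral_modulation_type>(0);

  const auto *graph = armral_ldpc_get_base_graph(bg);
  uint32_t in_size = graph->nmessage_bits * z;
  uint32_t out_size = (graph->ncodeword_bits + 2) * z;

  // Bit-packed input and output buffers, sized as in the benchmark.
  std::vector<uint8_t> in((in_size + 7) / 8);
  std::vector<uint8_t> out((out_size + 7) / 8);

  armral_ldpc_rate_matching(bg, z, e, nref, len_filler_bits, in_size, rv, mod,
                            in.data(), out.data());
  return 0;
}
```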
diff --git a/bench/CRC/11/BigEndian/bench.py b/bench/CRC/11/BigEndian/bench.py index be032e5..6c6f668 100755 --- a/bench/CRC/11/BigEndian/bench.py +++ b/bench/CRC/11/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/BigEndian/main.cpp b/bench/CRC/11/BigEndian/main.cpp index b6835f8..d82dbd4 100644 --- a/bench/CRC/11/BigEndian/main.cpp +++ b/bench/CRC/11/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/11/LittleEndian/bench.py b/bench/CRC/11/LittleEndian/bench.py index 321d2ed..350c7ea 100755 --- a/bench/CRC/11/LittleEndian/bench.py +++ b/bench/CRC/11/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/11/LittleEndian/main.cpp b/bench/CRC/11/LittleEndian/main.cpp index 8a5760e..533b507 100644 --- a/bench/CRC/11/LittleEndian/main.cpp +++ b/bench/CRC/11/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/BigEndian/bench.py b/bench/CRC/16/BigEndian/bench.py index cb41579..8bf0fc0 100755 --- a/bench/CRC/16/BigEndian/bench.py +++ b/bench/CRC/16/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/BigEndian/main.cpp b/bench/CRC/16/BigEndian/main.cpp index ec28a53..a81ccf5 100644 --- a/bench/CRC/16/BigEndian/main.cpp +++ b/bench/CRC/16/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/16/LittleEndian/bench.py b/bench/CRC/16/LittleEndian/bench.py index 7ddf27a..4c8ce83 100755 --- a/bench/CRC/16/LittleEndian/bench.py +++ b/bench/CRC/16/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/16/LittleEndian/main.cpp b/bench/CRC/16/LittleEndian/main.cpp index a88e38c..ded10e8 100644 --- a/bench/CRC/16/LittleEndian/main.cpp +++ b/bench/CRC/16/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/BigEndian/bench.py b/bench/CRC/24/A/BigEndian/bench.py index 4d69b5f..a69cb7e 100755 --- a/bench/CRC/24/A/BigEndian/bench.py +++ b/bench/CRC/24/A/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import 
Path diff --git a/bench/CRC/24/A/BigEndian/main.cpp b/bench/CRC/24/A/BigEndian/main.cpp index e970ae3..ee1e1c7 100644 --- a/bench/CRC/24/A/BigEndian/main.cpp +++ b/bench/CRC/24/A/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/A/LittleEndian/bench.py b/bench/CRC/24/A/LittleEndian/bench.py index e072699..576bafc 100755 --- a/bench/CRC/24/A/LittleEndian/bench.py +++ b/bench/CRC/24/A/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/A/LittleEndian/main.cpp b/bench/CRC/24/A/LittleEndian/main.cpp index 43d515f..17325f8 100644 --- a/bench/CRC/24/A/LittleEndian/main.cpp +++ b/bench/CRC/24/A/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/BigEndian/bench.py b/bench/CRC/24/B/BigEndian/bench.py index 217c6ff..aa31855 100755 --- a/bench/CRC/24/B/BigEndian/bench.py +++ b/bench/CRC/24/B/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/BigEndian/main.cpp b/bench/CRC/24/B/BigEndian/main.cpp index 3bb0e97..876deaf 100644 --- a/bench/CRC/24/B/BigEndian/main.cpp +++ b/bench/CRC/24/B/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/B/LittleEndian/bench.py b/bench/CRC/24/B/LittleEndian/bench.py index bcb362e..cbd7e95 100755 --- a/bench/CRC/24/B/LittleEndian/bench.py +++ b/bench/CRC/24/B/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/B/LittleEndian/main.cpp b/bench/CRC/24/B/LittleEndian/main.cpp index 480e9f5..b19eb35 100644 --- a/bench/CRC/24/B/LittleEndian/main.cpp +++ b/bench/CRC/24/B/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/BigEndian/bench.py b/bench/CRC/24/C/BigEndian/bench.py index b23ef53..42303ee 100755 --- a/bench/CRC/24/C/BigEndian/bench.py +++ b/bench/CRC/24/C/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/C/BigEndian/main.cpp b/bench/CRC/24/C/BigEndian/main.cpp index dc73062..e1a18f2 100644 --- a/bench/CRC/24/C/BigEndian/main.cpp +++ b/bench/CRC/24/C/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its 
affiliates */ #include "armral.h" diff --git a/bench/CRC/24/C/LittleEndian/bench.py b/bench/CRC/24/C/LittleEndian/bench.py index a246f0a..331bb26 100755 --- a/bench/CRC/24/C/LittleEndian/bench.py +++ b/bench/CRC/24/C/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/24/C/LittleEndian/main.cpp b/bench/CRC/24/C/LittleEndian/main.cpp index 87177e9..d9c0a81 100644 --- a/bench/CRC/24/C/LittleEndian/main.cpp +++ b/bench/CRC/24/C/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/BigEndian/bench.py b/bench/CRC/6/BigEndian/bench.py index 41b955e..bb64225 100755 --- a/bench/CRC/6/BigEndian/bench.py +++ b/bench/CRC/6/BigEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/BigEndian/main.cpp b/bench/CRC/6/BigEndian/main.cpp index 7e7101b..b74b808 100644 --- a/bench/CRC/6/BigEndian/main.cpp +++ b/bench/CRC/6/BigEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/CRC/6/LittleEndian/bench.py b/bench/CRC/6/LittleEndian/bench.py index 3777eb9..7878f82 100755 --- a/bench/CRC/6/LittleEndian/bench.py +++ b/bench/CRC/6/LittleEndian/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/CRC/6/LittleEndian/main.cpp b/bench/CRC/6/LittleEndian/main.cpp index f1eb3ce..8363eae 100644 --- a/bench/CRC/6/LittleEndian/main.cpp +++ b/bench/CRC/6/LittleEndian/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Decoding/bench.py b/bench/ConvCoding/Decoding/bench.py index aa5dc8d..16ebdb0 100755 --- a/bench/ConvCoding/Decoding/bench.py +++ b/bench/ConvCoding/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Decoding/main.cpp b/bench/ConvCoding/Decoding/main.cpp index 1cb1cb4..8bc34dd 100644 --- a/bench/ConvCoding/Decoding/main.cpp +++ b/bench/ConvCoding/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ConvCoding/Encoding/bench.py b/bench/ConvCoding/Encoding/bench.py index 0615a12..fca556c 100755 --- a/bench/ConvCoding/Encoding/bench.py +++ b/bench/ConvCoding/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm 
Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ConvCoding/Encoding/main.cpp b/bench/ConvCoding/Encoding/main.cpp index a343760..8221726 100644 --- a/bench/ConvCoding/Encoding/main.cpp +++ b/bench/ConvCoding/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Correlation/bench.py b/bench/Correlation/bench.py index cc9dfe4..9a36a39 100755 --- a/bench/Correlation/bench.py +++ b/bench/Correlation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Correlation/main.cpp b/bench/Correlation/main.cpp index 3438400..068172f 100644 --- a/bench/Correlation/main.cpp +++ b/bench/Correlation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Demodulation/bench.py b/bench/Demodulation/bench.py index 1740b99..051554d 100755 --- a/bench/Demodulation/bench.py +++ b/bench/Demodulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Demodulation/main.cpp b/bench/Demodulation/main.cpp index 03175d7..d6b9c92 100644 --- a/bench/Demodulation/main.cpp +++ b/bench/Demodulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul16/bench.py b/bench/ElemWiseVectorMult/VecMul16/bench.py index 813aa84..c8d40c3 100755 --- a/bench/ElemWiseVectorMult/VecMul16/bench.py +++ b/bench/ElemWiseVectorMult/VecMul16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul16/main.cpp b/bench/ElemWiseVectorMult/VecMul16/main.cpp index a08b0fe..3c40c29 100644 --- a/bench/ElemWiseVectorMult/VecMul16/main.cpp +++ b/bench/ElemWiseVectorMult/VecMul16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul16_2/bench.py b/bench/ElemWiseVectorMult/VecMul16_2/bench.py index b64bfa3..ee9f806 100755 --- a/bench/ElemWiseVectorMult/VecMul16_2/bench.py +++ b/bench/ElemWiseVectorMult/VecMul16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp b/bench/ElemWiseVectorMult/VecMul16_2/main.cpp index cb97179..758eece 100644 --- a/bench/ElemWiseVectorMult/VecMul16_2/main.cpp +++ b/bench/ElemWiseVectorMult/VecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited 
and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32/bench.py b/bench/ElemWiseVectorMult/VecMul32/bench.py index a220f0e..b8007ef 100755 --- a/bench/ElemWiseVectorMult/VecMul32/bench.py +++ b/bench/ElemWiseVectorMult/VecMul32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32/main.cpp b/bench/ElemWiseVectorMult/VecMul32/main.cpp index 433a2c9..76c149a 100644 --- a/bench/ElemWiseVectorMult/VecMul32/main.cpp +++ b/bench/ElemWiseVectorMult/VecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ElemWiseVectorMult/VecMul32_2/bench.py b/bench/ElemWiseVectorMult/VecMul32_2/bench.py index 382356f..ea08bf9 100755 --- a/bench/ElemWiseVectorMult/VecMul32_2/bench.py +++ b/bench/ElemWiseVectorMult/VecMul32_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp b/bench/ElemWiseVectorMult/VecMul32_2/main.cpp index d947138..ec1a210 100644 --- a/bench/ElemWiseVectorMult/VecMul32_2/main.cpp +++ b/bench/ElemWiseVectorMult/VecMul32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FFT/FFT16/bench.py b/bench/FFT/FFT16/bench.py index c7ee0ce..f560e04 100755 --- a/bench/FFT/FFT16/bench.py +++ b/bench/FFT/FFT16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT16/main.cpp b/bench/FFT/FFT16/main.cpp index 402b0a5..2ce24b0 100644 --- a/bench/FFT/FFT16/main.cpp +++ b/bench/FFT/FFT16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FFT/FFT32/bench.py b/bench/FFT/FFT32/bench.py index 3e1a30e..83a3e73 100755 --- a/bench/FFT/FFT32/bench.py +++ b/bench/FFT/FFT32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FFT/FFT32/main.cpp b/bench/FFT/FFT32/main.cpp index 1281790..d20456b 100644 --- a/bench/FFT/FFT32/main.cpp +++ b/bench/FFT/FFT32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16/bench.py b/bench/FIR/FIR16/bench.py index 63fa35f..f0b19e8 100755 --- a/bench/FIR/FIR16/bench.py +++ b/bench/FIR/FIR16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its 
affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16/main.cpp b/bench/FIR/FIR16/main.cpp index 26a0287..58ee2c8 100644 --- a/bench/FIR/FIR16/main.cpp +++ b/bench/FIR/FIR16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR16Decimate2/bench.py b/bench/FIR/FIR16Decimate2/bench.py index 716a98d..956ca7c 100755 --- a/bench/FIR/FIR16Decimate2/bench.py +++ b/bench/FIR/FIR16Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR16Decimate2/main.cpp b/bench/FIR/FIR16Decimate2/main.cpp index 8e2ec7b..8b8265a 100644 --- a/bench/FIR/FIR16Decimate2/main.cpp +++ b/bench/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32/bench.py b/bench/FIR/FIR32/bench.py index 731f0be..86757b6 100755 --- a/bench/FIR/FIR32/bench.py +++ b/bench/FIR/FIR32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32/main.cpp b/bench/FIR/FIR32/main.cpp index 4c35278..02e3b08 100644 --- a/bench/FIR/FIR32/main.cpp +++ b/bench/FIR/FIR32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/FIR/FIR32Decimate2/bench.py b/bench/FIR/FIR32Decimate2/bench.py index 3561ceb..41fc6c1 100755 --- a/bench/FIR/FIR32Decimate2/bench.py +++ b/bench/FIR/FIR32Decimate2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/FIR/FIR32Decimate2/main.cpp b/bench/FIR/FIR32Decimate2/main.cpp index 83d7da7..b663f19 100644 --- a/bench/FIR/FIR32Decimate2/main.cpp +++ b/bench/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/LDPC/Decoding/bench.py b/bench/LDPC/Decoding/bench.py index 98e5ac0..620ad12 100755 --- a/bench/LDPC/Decoding/bench.py +++ b/bench/LDPC/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/LDPC/Decoding/main.cpp b/bench/LDPC/Decoding/main.cpp index cab11b2..9d26974 100755 --- a/bench/LDPC/Decoding/main.cpp +++ b/bench/LDPC/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -33,13 +33,13 @@ void run_ldpc_decoding_perf(armral_ldpc_graph_t 
bg, uint32_t z, std::vector buffer(buffer_size); for (uint32_t r = 0; r < num_reps; ++r) { buffer_bump_allocator allocator{buffer.data()}; - armral_ldpc::decode_block( + armral::ldpc::decode_block( llr_ptr, bg, z, crc_idx, num_its, out_ptr, allocator); } #else for (uint32_t r = 0; r < num_reps; ++r) { heap_allocator allocator{}; - armral_ldpc::decode_block( + armral::ldpc::decode_block( llr_ptr, bg, z, crc_idx, num_its, out_ptr, allocator); } #endif diff --git a/bench/LDPC/Encoding/bench.py b/bench/LDPC/Encoding/bench.py index 7bfc65b..dd8f9d4 100755 --- a/bench/LDPC/Encoding/bench.py +++ b/bench/LDPC/Encoding/bench.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates + import json import itertools from pathlib import Path @@ -11,10 +12,12 @@ def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) exe_name = get_path("bench_ldpc_encoding") + j = { "exe_name": exe_name, "cases": [] } + base_graphs = [1, 2] lifting_sizes = [2, 11, 16, 18, 30, 36, 52, 112, 160, 208, 384] len_filler_bits = [0, 0, 0, 76, 0, 0, 0, 72, 0, 0, 0] diff --git a/bench/LDPC/Encoding/main.cpp b/bench/LDPC/Encoding/main.cpp index 1c3c310..d7c075a 100644 --- a/bench/LDPC/Encoding/main.cpp +++ b/bench/LDPC/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" @@ -50,7 +50,7 @@ int main(int argc, char **argv) { // This gets converted into the enum representing the base // graph 1 -> LDPC_BASE_GRAPH_1 2 -> LDPC_BASE_GRAPH_2 // lifting_size: The lifting size Z to use in the block encoding - // len_filler_bits: Length of filler bits As per section 5.2.2 of TS 38.212 + // len_filler_bits: Length of filler bits as per section 5.2.2 of TS 38.212 // num_reps: The number of times to repeat the encoding, so as to get a // stable performance number printf("Usage: %s base_graph lifting_size len_filler_bits num_reps\n", diff --git a/bench/LDPC/RateMatching/bench.py b/bench/LDPC/RateMatching/bench.py new file mode 100755 index 0000000..5d752ec --- /dev/null +++ b/bench/LDPC/RateMatching/bench.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# Copyright 2023-2024 Arm Limited and/or its affiliates + +import json +from pathlib import Path +import os + + +def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_ldpc_rate_matching") + +j = { + "exe_name": exe_name, + "cases": [] +} + +para_list = [ + (1, 2, 44, 25344, 0, 0), + (1, 11, 242, 25344, 0, 0), + (1, 112, 2464, 25344, 0, 0), + (1, 208, 4576, 25344, 0, 0), + (1, 384, 8448, 25344, 0, 0), + (2, 2, 22, 19200, 0, 0), + (2, 11, 112, 19200, 0, 0), + (2, 112, 1232, 19200, 0, 0), + (2, 208, 2288, 19200, 0, 0), + (2, 384, 4224, 19200, 0, 0) +] + +# We scale the number of reps according to the lifting size. 
+target_reps = 150000 + +for bg, z, e, nref, rv, mod in para_list: + case = { + "name": "ldpc_rate_matching_bg{}_z{}_e{}_nref{}_rv{}_mod{}".format( + bg, z, e, nref, rv, mod + ), + "args": "{} {} {} {} {} {}".format(bg, z, e, nref, rv, mod), + "reps": target_reps * 2 // z, + } + j["cases"].append(case) +print(json.dumps(j)) diff --git a/bench/LDPC/RateMatching/main.cpp b/bench/LDPC/RateMatching/main.cpp new file mode 100644 index 0000000..d99459a --- /dev/null +++ b/bench/LDPC/RateMatching/main.cpp @@ -0,0 +1,79 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ +#include "armral.h" +#include "ldpc_coding.hpp" + +#include +#include +#include + +namespace { + +void run_ldpc_rate_matching_perf(armral_ldpc_graph_t bg, uint32_t z, uint32_t e, + uint32_t nref, uint32_t rv, + armral_modulation_type mod, uint32_t reps) { + printf("[LDPC RATE MATCHING] bg = %u, z = %u, e = %u, nref = %u, rv = %u, " + "mod = %u, number of repetitions = %u\n", + (uint32_t)bg, z, e, nref, rv, (uint32_t)mod, reps); + + const auto *graph = armral_ldpc_get_base_graph(bg); + + uint32_t len_filler_bits = 0; + uint32_t in_size = graph->nmessage_bits * z; + uint32_t out_size = (graph->ncodeword_bits + 2) * z; + + std::vector in((in_size + 7) / 8); + std::vector out((out_size + 7) / 8); + const auto *in_ptr = in.data(); + auto *out_ptr = out.data(); +#ifdef ARMRAL_BENCH_NOALLOC + std::vector buffer((2 * z * 66) + e); + auto *buffer_ptr = buffer.data(); + + for (uint32_t r = 0; r < reps; ++r) { + armral_ldpc_rate_matching_noalloc(bg, z, e, nref, len_filler_bits, in_size, + rv, mod, in_ptr, out_ptr, buffer_ptr); + } +#else + for (uint32_t r = 0; r < reps; ++r) { + armral_ldpc_rate_matching(bg, z, e, nref, len_filler_bits, in_size, rv, mod, + in_ptr, out_ptr); + } +#endif +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 8) { + // base_graph: Integer representing the base graph to use. + // This gets converted into the enum representing the base + // graph 1 -> LDPC_BASE_GRAPH_1 2 -> LDPC_BASE_GRAPH_2 + // lifting_size: The lifting size Z. + // e: The number of bits in the rate-matched message. + // This is assumed to be a multiple of the number of + // bits per modulation symbol. + // nref: The soft buffer size for limited buffer rate matching. + // rv: Redundancy version used in rate matching. + // Must be in the set {0, 1, 2, 3}. + // mod: The type of modulation to perform. + // num_reps: The number of times to repeat the function. 
+ printf("Usage: %s base_graph lifting_size e nref rv mod num_reps\n", + argv[0]); + exit(EXIT_FAILURE); + } + + auto bg = (armral_ldpc_graph_t)(atoi(argv[1]) - 1); + auto z = (uint32_t)atoi(argv[2]); + auto e = (uint32_t)atoi(argv[3]); + auto nref = (uint32_t)atoi(argv[4]); + auto rv = (uint32_t)atoi(argv[5]); + auto mod = (armral_modulation_type)atoi(argv[6]); + auto reps = (uint32_t)atoi(argv[7]); + + run_ldpc_rate_matching_perf(bg, z, e, nref, rv, mod, reps); + + return EXIT_SUCCESS; +} diff --git a/bench/LDPC/RateRecovery/bench.py b/bench/LDPC/RateRecovery/bench.py new file mode 100755 index 0000000..02463ca --- /dev/null +++ b/bench/LDPC/RateRecovery/bench.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# Copyright 2023-2024 Arm Limited and/or its affiliates + +import json +from pathlib import Path +import os + + +def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_ldpc_rate_recovery") + +j = { + "exe_name": exe_name, + "cases": [] +} + +para_list = [ + (1, 2, 44, 25344, 0, 0), + (1, 11, 242, 25344, 0, 0), + (1, 112, 2464, 25344, 0, 0), + (1, 208, 4576, 25344, 0, 0), + (1, 384, 8448, 25344, 0, 0), + (2, 2, 22, 19200, 0, 0), + (2, 11, 112, 19200, 0, 0), + (2, 112, 1232, 19200, 0, 0), + (2, 208, 2288, 19200, 0, 0), + (2, 384, 4224, 19200, 0, 0) +] + +# We scale the number of reps according to the lifting size. +target_reps = 150000 + +for bg, z, e, nref, rv, mod in para_list: + case = { + "name": "ldpc_rate_recovery_bg{}_z{}_e{}_nref{}_rv{}_mod{}".format( + bg, z, e, nref, rv, mod + ), + "args": "{} {} {} {} {} {}".format( + bg, z, e, nref, rv, mod), + "reps": target_reps * 2 // z, + } + j["cases"].append(case) +print(json.dumps(j)) diff --git a/bench/LDPC/RateRecovery/main.cpp b/bench/LDPC/RateRecovery/main.cpp new file mode 100644 index 0000000..af9e056 --- /dev/null +++ b/bench/LDPC/RateRecovery/main.cpp @@ -0,0 +1,74 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ +#include "armral.h" +#include "ldpc_coding.hpp" + +#include +#include +#include + +namespace { + +void run_ldpc_rate_recovery_perf(armral_ldpc_graph_t bg, uint32_t z, uint32_t e, + uint32_t nref, uint32_t rv, + armral_modulation_type mod, uint32_t reps) { + printf("[LDPC RATE RECOVERY] bg = %u, z = %u, e = %u, nref = %u, rv = %u, " + "mod = %u, number of repetitions = %u\n", + (uint32_t)bg, z, e, nref, rv, (uint32_t)mod, reps); + + uint32_t len_filler_bits = 0; + uint32_t n = (bg == LDPC_BASE_GRAPH_2) ? 50 * z : 66 * z; + std::vector in(n); + std::vector out(n); + const auto *in_ptr = in.data(); + auto *out_ptr = out.data(); + +#ifdef ARMRAL_BENCH_NOALLOC + std::vector buffer((z * 66) + e); + auto *buffer_ptr = buffer.data(); + + for (uint32_t r = 0; r < reps; ++r) { + armral_ldpc_rate_recovery_noalloc(bg, z, e, nref, len_filler_bits, n, rv, + mod, in_ptr, out_ptr, buffer_ptr); + } +#else + for (uint32_t r = 0; r < reps; ++r) { + armral_ldpc_rate_recovery(bg, z, e, nref, len_filler_bits, n, rv, mod, + in_ptr, out_ptr); + } +#endif +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 8) { + // base_graph: Integer representing the base graph to use. + // This gets converted into the enum representing the base + // graph 1 -> LDPC_BASE_GRAPH_1 2 -> LDPC_BASE_GRAPH_2 + // lifting_size: The lifting size Z. + // e: The number of LLRs in the demodulated message. + // nref: The soft buffer size for limited buffer rate recovery. 
+ // rv: Redundancy version used in rate recovery. + // Must be in the set {0, 1, 2, 3}. + // mod: The type of modulation which was performed. + // num_reps: The number of times to repeat the function. + printf("Usage: %s base_graph lifting_size e nref rv mod num_reps\n", + argv[0]); + exit(EXIT_FAILURE); + } + + auto bg = (armral_ldpc_graph_t)(atoi(argv[1]) - 1); + auto z = (uint32_t)atoi(argv[2]); + auto e = (uint32_t)atoi(argv[3]); + auto nref = (uint32_t)atoi(argv[4]); + auto rv = (uint32_t)atoi(argv[5]); + auto mod = (armral_modulation_type)atoi(argv[6]); + auto reps = (uint32_t)atoi(argv[7]); + + run_ldpc_rate_recovery_perf(bg, z, e, nref, rv, mod, reps); + + return EXIT_SUCCESS; +} diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py b/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py index dee5845..74414bb 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py +++ b/bench/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp b/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp index 9c3505f..32847b4 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp +++ b/bench/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py b/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py index e52b7fb..aeaf28c 100755 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py +++ b/bench/MatrixInv/Batch/GeneralMatInv/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp b/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp index 83ffe65..0f27c83 100644 --- a/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp +++ b/bench/MatrixInv/Batch/GeneralMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py b/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py index ee297ba..8710c18 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py +++ b/bench/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp b/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp index 401f767..8c66a87 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp +++ b/bench/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" 
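The new LDPC rate-matching and rate-recovery benchmarks added earlier in this patch share the same parameter table and the same rule for scaling repetitions with the lifting size Z. The sketch below is editorial illustration only, not part of the patch: it reproduces the case generation from the two new bench.py scripts. The parameter tuples, the 150000 target and the `target_reps * 2 // z` scaling are taken from the files above; the `make_cases` helper and the hard-coded executable name (the real scripts resolve it through `get_path`) are assumptions made for brevity.

```python
# Illustrative sketch (not part of the patch) of how the new
# bench/LDPC/RateMatching/bench.py and bench/LDPC/RateRecovery/bench.py
# scripts build their benchmark cases.
import json

# (base graph, lifting size Z, E, Nref, rv, modulation) - subset of the
# parameter list in the new bench.py scripts.
para_list = [
    (1, 2, 44, 25344, 0, 0),
    (1, 384, 8448, 25344, 0, 0),
    (2, 2, 22, 19200, 0, 0),
    (2, 384, 4224, 19200, 0, 0),
]

target_reps = 150000


def make_cases(prefix):
    cases = []
    for bg, z, e, nref, rv, mod in para_list:
        cases.append({
            "name": f"{prefix}_bg{bg}_z{z}_e{e}_nref{nref}_rv{rv}_mod{mod}",
            "args": f"{bg} {z} {e} {nref} {rv} {mod}",
            # Larger lifting sizes do more work per call, so fewer reps.
            "reps": target_reps * 2 // z,
        })
    return cases


print(json.dumps({"exe_name": "bench_ldpc_rate_matching",
                  "cases": make_cases("ldpc_rate_matching")}))
```

Each generated case corresponds to one invocation of the form `bench_ldpc_rate_matching base_graph lifting_size e nref rv mod num_reps` (and likewise for bench_ldpc_rate_recovery), matching the usage strings printed by the new main.cpp files; the final num_reps argument is presumably supplied from the "reps" field by whatever harness consumes this JSON.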
diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py b/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py index 2994dbb..eaf5b9b 100755 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py +++ b/bench/MatrixInv/Batch/HermitianMatInv/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp b/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp index 8db0c70..6536ea8 100644 --- a/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp +++ b/bench/MatrixInv/Batch/HermitianMatInv/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Single/GeneralMatInv/bench.py b/bench/MatrixInv/Single/GeneralMatInv/bench.py index 88328f2..3903a1b 100755 --- a/bench/MatrixInv/Single/GeneralMatInv/bench.py +++ b/bench/MatrixInv/Single/GeneralMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/GeneralMatInv/main.cpp b/bench/MatrixInv/Single/GeneralMatInv/main.cpp index 6847b07..94e63e0 100644 --- a/bench/MatrixInv/Single/GeneralMatInv/main.cpp +++ b/bench/MatrixInv/Single/GeneralMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixInv/Single/HermitianMatInv/bench.py b/bench/MatrixInv/Single/HermitianMatInv/bench.py index 482800f..f0d7e5b 100755 --- a/bench/MatrixInv/Single/HermitianMatInv/bench.py +++ b/bench/MatrixInv/Single/HermitianMatInv/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixInv/Single/HermitianMatInv/main.cpp b/bench/MatrixInv/Single/HermitianMatInv/main.cpp index 9d05359..c9b708f 100644 --- a/bench/MatrixInv/Single/HermitianMatInv/main.cpp +++ b/bench/MatrixInv/Single/HermitianMatInv/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py b/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py index d4a1e39..e7edca6 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py +++ b/bench/MatrixMult/Batch/ArmSolve/1x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp b/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp index 5f82b28..9b0453f 100644 --- a/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp +++ b/bench/MatrixMult/Batch/ArmSolve/1x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm 
Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py b/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py index cbc5866..d190973 100755 --- a/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py +++ b/bench/MatrixMult/Batch/ArmSolve/1x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp b/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp index 135427a..1b8bd8b 100644 --- a/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp +++ b/bench/MatrixMult/Batch/ArmSolve/1x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py b/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py index b3f69b1..aee2667 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py +++ b/bench/MatrixMult/Batch/ArmSolve/2x2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp b/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp index d14a059..b554c1f 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp +++ b/bench/MatrixMult/Batch/ArmSolve/2x2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py b/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py index 697939b..9dd2030 100755 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py +++ b/bench/MatrixMult/Batch/ArmSolve/2x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp b/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp index 80fa307..7637055 100644 --- a/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp +++ b/bench/MatrixMult/Batch/ArmSolve/2x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py b/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py index 0973b25..ee929f2 100755 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py +++ b/bench/MatrixMult/Batch/ArmSolve/4x4/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp b/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp index 5aa2b0f..4082649 100644 --- a/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp +++ b/bench/MatrixMult/Batch/ArmSolve/4x4/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 
2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py index 6b745e0..8549eab 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp index 333fef3..6d0006f 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py index 99baf98..4ecaa28 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp index ab7f760..85b3f96 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py index 491754e..621a7a0 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp index f7cbfd7..63034a6 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py index cb8327f..5e88789 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff 
--git a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp index 74252bf..bdfbd19 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py index 4e1dc7c..0cfde51 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp index 4002b90..3344fe6 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py b/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py index a350fe3..78f0bae 100755 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py +++ b/bench/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import itertools import json diff --git a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp b/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp index a42ddaa..0aa6934 100644 --- a/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp +++ b/bench/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py b/bench/MatrixMult/Single/MatrixMult16/32b/bench.py index 0976b82..5e4312e 100755 --- a/bench/MatrixMult/Single/MatrixMult16/32b/bench.py +++ b/bench/MatrixMult/Single/MatrixMult16/32b/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp b/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp index 6309d53..3462e61 100644 --- a/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult16/32b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py b/bench/MatrixMult/Single/MatrixMult16/64b/bench.py index 2f8b3ea..ef4ad39 100755 --- a/bench/MatrixMult/Single/MatrixMult16/64b/bench.py +++ b/bench/MatrixMult/Single/MatrixMult16/64b/bench.py @@ 
-1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp b/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp index b4a686a..a1ebda8 100644 --- a/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult16/64b/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py b/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py index b8be894..6125bad 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py +++ b/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp b/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp index fdc8a19..890a174 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py b/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py index cb0f7cf..f58fc63 100755 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py +++ b/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp b/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp index 7df291a..74ab2b7 100644 --- a/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py b/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py index 8787e70..c7dd1f9 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py +++ b/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp b/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp index 3d5da96..3ccd0ce 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git 
a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py b/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py index 23edd2b..626b618 100755 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py +++ b/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp b/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp index 115789e..d0eb869 100644 --- a/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMult32/general/bench.py b/bench/MatrixMult/Single/MatrixMult32/general/bench.py index 2ef90b6..cae82ac 100755 --- a/bench/MatrixMult/Single/MatrixMult32/general/bench.py +++ b/bench/MatrixMult/Single/MatrixMult32/general/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp b/bench/MatrixMult/Single/MatrixMult32/general/main.cpp index d15b3ee..f37000d 100644 --- a/bench/MatrixMult/Single/MatrixMult32/general/main.cpp +++ b/bench/MatrixMult/Single/MatrixMult32/general/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py b/bench/MatrixMult/Single/MatrixMultAAH32/bench.py index 02344b0..c911c26 100755 --- a/bench/MatrixMult/Single/MatrixMultAAH32/bench.py +++ b/bench/MatrixMult/Single/MatrixMultAAH32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023 Arm Limited and/or its affiliates +# Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp b/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp index fdee30a..5ebdcf1 100644 --- a/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp +++ b/bench/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py b/bench/MatrixMult/Single/MatrixMultAHB32/bench.py index ffee1cb..9a58a3d 100755 --- a/bench/MatrixMult/Single/MatrixMultAHB32/bench.py +++ b/bench/MatrixMult/Single/MatrixMultAHB32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023 Arm Limited and/or its affiliates +# Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp b/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp index cd7693b..1fee878 100644 --- a/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp +++ b/bench/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - 
Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py b/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py index e63f739..c15b3ab 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py +++ b/bench/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp b/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp index db470ad..63e49a2 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp +++ b/bench/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py b/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py index 14faaa0..4ada38d 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py +++ b/bench/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp b/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp index b9da2d4..8c0c7b1 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp +++ b/bench/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py b/bench/MatrixMult/Single/MatrixVectorMult32/bench.py index 52b224f..fb5e762 100755 --- a/bench/MatrixMult/Single/MatrixVectorMult32/bench.py +++ b/bench/MatrixMult/Single/MatrixVectorMult32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp b/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp index 36f16aa..9cc90ba 100644 --- a/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp +++ b/bench/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/bench/MatrixPseudoInv/Direct/bench.py b/bench/MatrixPseudoInv/Direct/bench.py index a8dbdd3..fcfb462 100755 --- a/bench/MatrixPseudoInv/Direct/bench.py +++ b/bench/MatrixPseudoInv/Direct/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2023 Arm Limited and/or its affiliates +# Copyright 2023-2024 Arm Limited and/or its affiliates import json import itertools @@ -20,10 +20,10 @@ j = { "cases": [] } +size1 = [2, 3, 4, 8, 16] +size2 = [32, 64, 128, 256] -rows = [2, 3, 4, 8, 16] -cols = 
[32, 64, 128, 256] -for (m, n) in itertools.product(rows, cols): +for (m, n) in itertools.chain(zip(size1, size2), zip(size2, size1)): case = { "name": "mat_pseudo_inv_direct_{}_{}".format(m, n), "args": "{} {}".format(m, n), diff --git a/bench/MatrixPseudoInv/Direct/main.cpp b/bench/MatrixPseudoInv/Direct/main.cpp index e5c24ab..6339c24 100644 --- a/bench/MatrixPseudoInv/Direct/main.cpp +++ b/bench/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -26,7 +26,8 @@ void run_mat_pinv_perf(uint32_t m, uint32_t n, uint32_t num_reps) { #ifdef ARMRAL_BENCH_NOALLOC // Benchmark only added for interest. This is not expected to show any major // performance difference. - std::vector buffer(m * m * sizeof(armral_cmplx_f32_t) + 3); + auto size = m > n ? n : m; + std::vector buffer(size * size * sizeof(armral_cmplx_f32_t) + 3); for (uint32_t i = 0; i < num_reps; ++i) { armral_cmplx_pseudo_inverse_direct_f32_noalloc(m, n, lambda, in_ptr, out_ptr, buffer.data()); @@ -52,7 +53,8 @@ int main(int argc, char **argv) { auto n = (uint32_t)atoi(argv[2]); auto num_reps = (uint32_t)atoi(argv[3]); - assert(m == 2 || m == 3 || m == 4 || m == 8 || m == 16); + [[maybe_unused]] auto size = m > n ? n : m; + assert(size == 2 || size == 3 || size == 4 || size == 8 || size == 16); run_mat_pinv_perf(m, n, num_reps); diff --git a/bench/Modulation/bench.py b/bench/Modulation/bench.py index 4718fb8..e6dcff6 100755 --- a/bench/Modulation/bench.py +++ b/bench/Modulation/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Modulation/main.cpp b/bench/Modulation/main.cpp index 4e836ac..5e0f7ba 100644 --- a/bench/Modulation/main.cpp +++ b/bench/Modulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/14bit/bench.py b/bench/MuLaw/Compression/14bit/bench.py index c222bf3..2b9ab70 100755 --- a/bench/MuLaw/Compression/14bit/bench.py +++ b/bench/MuLaw/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/14bit/main.cpp b/bench/MuLaw/Compression/14bit/main.cpp index 3d60c30..73b7699 100644 --- a/bench/MuLaw/Compression/14bit/main.cpp +++ b/bench/MuLaw/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/8bit/bench.py b/bench/MuLaw/Compression/8bit/bench.py index a7aa8e0..43cefd8 100755 --- a/bench/MuLaw/Compression/8bit/bench.py +++ b/bench/MuLaw/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/8bit/main.cpp b/bench/MuLaw/Compression/8bit/main.cpp index 
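For the direct pseudo-inverse benchmark changed just above: bench.py now exercises both tall (m < n) and wide (m > n) shapes by chaining zip(size1, size2) with zip(size2, size1), and the no-alloc scratch buffer in main.cpp is sized from the smaller of the two dimensions rather than from m. A minimal sketch of that sizing rule, assuming armral_cmplx_f32_t is a pair of 32-bit floats (8 bytes); the helper name is illustrative only:

```python
# Sketch of the scratch-buffer sizing in the updated
# bench/MatrixPseudoInv/Direct/main.cpp (ARMRAL_BENCH_NOALLOC path).
# Assumes sizeof(armral_cmplx_f32_t) == 8; pseudo_inverse_buffer_bytes
# is a hypothetical helper, not an armral API.
def pseudo_inverse_buffer_bytes(m, n, sizeof_cmplx_f32=8):
    size = min(m, n)  # previously the buffer was always sized from m
    return size * size * sizeof_cmplx_f32 + 3


# (4, 128) and (128, 4) are among the shape pairs generated by the
# updated bench.py; either orientation needs the same scratch space.
print(pseudo_inverse_buffer_bytes(4, 128))   # 131
print(pseudo_inverse_buffer_bytes(128, 4))   # 131
```

This is also why the assert in main.cpp now checks min(m, n), rather than m, against the supported sizes {2, 3, 4, 8, 16}.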
fbf6c83..2faa911 100644 --- a/bench/MuLaw/Compression/8bit/main.cpp +++ b/bench/MuLaw/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Compression/9bit/bench.py b/bench/MuLaw/Compression/9bit/bench.py index f7f54fe..cc24e67 100755 --- a/bench/MuLaw/Compression/9bit/bench.py +++ b/bench/MuLaw/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Compression/9bit/main.cpp b/bench/MuLaw/Compression/9bit/main.cpp index fd1641b..a2c1118 100644 --- a/bench/MuLaw/Compression/9bit/main.cpp +++ b/bench/MuLaw/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/14bit/bench.py b/bench/MuLaw/Decompression/14bit/bench.py index 7b65e7f..8f6d2b1 100755 --- a/bench/MuLaw/Decompression/14bit/bench.py +++ b/bench/MuLaw/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/14bit/main.cpp b/bench/MuLaw/Decompression/14bit/main.cpp index d8f8cc1..a24bf21 100644 --- a/bench/MuLaw/Decompression/14bit/main.cpp +++ b/bench/MuLaw/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/8bit/bench.py b/bench/MuLaw/Decompression/8bit/bench.py index 00419a6..f70ecaf 100755 --- a/bench/MuLaw/Decompression/8bit/bench.py +++ b/bench/MuLaw/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/8bit/main.cpp b/bench/MuLaw/Decompression/8bit/main.cpp index 0d9b6ca..c3a0f0a 100644 --- a/bench/MuLaw/Decompression/8bit/main.cpp +++ b/bench/MuLaw/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/MuLaw/Decompression/9bit/bench.py b/bench/MuLaw/Decompression/9bit/bench.py index cd0e939..67512df 100755 --- a/bench/MuLaw/Decompression/9bit/bench.py +++ b/bench/MuLaw/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/MuLaw/Decompression/9bit/main.cpp b/bench/MuLaw/Decompression/9bit/main.cpp index 0d18222..2bcde05 100644 --- a/bench/MuLaw/Decompression/9bit/main.cpp +++ b/bench/MuLaw/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its 
affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/14bit/bench.py b/bench/ORanBlockScaling/Compression/14bit/bench.py index 845568c..e2b2f15 100755 --- a/bench/ORanBlockScaling/Compression/14bit/bench.py +++ b/bench/ORanBlockScaling/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/14bit/main.cpp b/bench/ORanBlockScaling/Compression/14bit/main.cpp index e1c7b68..37f8da1 100644 --- a/bench/ORanBlockScaling/Compression/14bit/main.cpp +++ b/bench/ORanBlockScaling/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/8bit/bench.py b/bench/ORanBlockScaling/Compression/8bit/bench.py index 03f98d2..65d5537 100755 --- a/bench/ORanBlockScaling/Compression/8bit/bench.py +++ b/bench/ORanBlockScaling/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/8bit/main.cpp b/bench/ORanBlockScaling/Compression/8bit/main.cpp index 706b471..43286ca 100644 --- a/bench/ORanBlockScaling/Compression/8bit/main.cpp +++ b/bench/ORanBlockScaling/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Compression/9bit/bench.py b/bench/ORanBlockScaling/Compression/9bit/bench.py index ea6d874..54f9931 100755 --- a/bench/ORanBlockScaling/Compression/9bit/bench.py +++ b/bench/ORanBlockScaling/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Compression/9bit/main.cpp b/bench/ORanBlockScaling/Compression/9bit/main.cpp index f203d3b..7d66f42 100644 --- a/bench/ORanBlockScaling/Compression/9bit/main.cpp +++ b/bench/ORanBlockScaling/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/14bit/bench.py b/bench/ORanBlockScaling/Decompression/14bit/bench.py index 347eb5c..cbb57d2 100755 --- a/bench/ORanBlockScaling/Decompression/14bit/bench.py +++ b/bench/ORanBlockScaling/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/14bit/main.cpp b/bench/ORanBlockScaling/Decompression/14bit/main.cpp index a2852e6..a9448f9 100644 --- a/bench/ORanBlockScaling/Decompression/14bit/main.cpp +++ 
b/bench/ORanBlockScaling/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/8bit/bench.py b/bench/ORanBlockScaling/Decompression/8bit/bench.py index fdcdc64..2807325 100755 --- a/bench/ORanBlockScaling/Decompression/8bit/bench.py +++ b/bench/ORanBlockScaling/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/8bit/main.cpp b/bench/ORanBlockScaling/Decompression/8bit/main.cpp index c60c504..60ffeec 100644 --- a/bench/ORanBlockScaling/Decompression/8bit/main.cpp +++ b/bench/ORanBlockScaling/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/ORanBlockScaling/Decompression/9bit/bench.py b/bench/ORanBlockScaling/Decompression/9bit/bench.py index 5510639..f16d82d 100755 --- a/bench/ORanBlockScaling/Decompression/9bit/bench.py +++ b/bench/ORanBlockScaling/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/ORanBlockScaling/Decompression/9bit/main.cpp b/bench/ORanBlockScaling/Decompression/9bit/main.cpp index 0ead485..8cbab20 100644 --- a/bench/ORanBlockScaling/Decompression/9bit/main.cpp +++ b/bench/ORanBlockScaling/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Decoding/bench.py b/bench/Polar/Decoding/bench.py index b69cda6..b9b3ad6 100755 --- a/bench/Polar/Decoding/bench.py +++ b/bench/Polar/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Decoding/main.cpp b/bench/Polar/Decoding/main.cpp index fe12c2f..6da1928 100644 --- a/bench/Polar/Decoding/main.cpp +++ b/bench/Polar/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "rng.hpp" diff --git a/bench/Polar/Encoding/bench.py b/bench/Polar/Encoding/bench.py index 8c801cb..d01db8b 100755 --- a/bench/Polar/Encoding/bench.py +++ b/bench/Polar/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Encoding/main.cpp b/bench/Polar/Encoding/main.cpp index e3b1a69..86bd403 100644 --- a/bench/Polar/Encoding/main.cpp +++ b/bench/Polar/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + 
Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/Frozen/bench.py b/bench/Polar/Frozen/bench.py index cc3eab0..c25d3c8 100755 --- a/bench/Polar/Frozen/bench.py +++ b/bench/Polar/Frozen/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/Frozen/main.cpp b/bench/Polar/Frozen/main.cpp index 6cd66bd..8db346a 100644 --- a/bench/Polar/Frozen/main.cpp +++ b/bench/Polar/Frozen/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateMatching/bench.py b/bench/Polar/RateMatching/bench.py index 8286ded..fa5715f 100755 --- a/bench/Polar/RateMatching/bench.py +++ b/bench/Polar/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateMatching/main.cpp b/bench/Polar/RateMatching/main.cpp index 4ba4827..af6a831 100644 --- a/bench/Polar/RateMatching/main.cpp +++ b/bench/Polar/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/RateRecovery/bench.py b/bench/Polar/RateRecovery/bench.py index 1427393..4687b6d 100755 --- a/bench/Polar/RateRecovery/bench.py +++ b/bench/Polar/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/RateRecovery/main.cpp b/bench/Polar/RateRecovery/main.cpp index 51c32b4..b687110 100644 --- a/bench/Polar/RateRecovery/main.cpp +++ b/bench/Polar/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelDeinterleave/bench.py b/bench/Polar/SubchannelDeinterleave/bench.py index ff19ba9..d804d3b 100755 --- a/bench/Polar/SubchannelDeinterleave/bench.py +++ b/bench/Polar/SubchannelDeinterleave/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelDeinterleave/main.cpp b/bench/Polar/SubchannelDeinterleave/main.cpp index b992f58..54e9108 100644 --- a/bench/Polar/SubchannelDeinterleave/main.cpp +++ b/bench/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Polar/SubchannelInterleave/bench.py b/bench/Polar/SubchannelInterleave/bench.py index d3f752e..8620391 100755 --- a/bench/Polar/SubchannelInterleave/bench.py +++ b/bench/Polar/SubchannelInterleave/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN 
Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Polar/SubchannelInterleave/main.cpp b/bench/Polar/SubchannelInterleave/main.cpp index 63777bc..c2623be 100644 --- a/bench/Polar/SubchannelInterleave/main.cpp +++ b/bench/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/SVD/bench.py b/bench/SVD/bench.py index fef7e94..22a8591 100755 --- a/bench/SVD/bench.py +++ b/bench/SVD/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/SVD/main.cpp b/bench/SVD/main.cpp index 67f5035..86cba92 100644 --- a/bench/SVD/main.cpp +++ b/bench/SVD/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Scrambling/bench.py b/bench/Scrambling/bench.py index e37a92f..ae4e285 100755 --- a/bench/Scrambling/bench.py +++ b/bench/Scrambling/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Scrambling/main.cpp b/bench/Scrambling/main.cpp index 3802388..6d85a8f 100644 --- a/bench/Scrambling/main.cpp +++ b/bench/Scrambling/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/SeqGenerator/bench.py b/bench/SeqGenerator/bench.py index a399b28..7d8ae27 100755 --- a/bench/SeqGenerator/bench.py +++ b/bench/SeqGenerator/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/SeqGenerator/main.cpp b/bench/SeqGenerator/main.cpp index 6e78e10..49baa2a 100644 --- a/bench/SeqGenerator/main.cpp +++ b/bench/SeqGenerator/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/bench/Turbo/Decoding/bench.py b/bench/Turbo/Decoding/bench.py index b2b4106..ebd3e38 100755 --- a/bench/Turbo/Decoding/bench.py +++ b/bench/Turbo/Decoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Decoding/main.cpp b/bench/Turbo/Decoding/main.cpp index d77b2f8..b0e21bb 100644 --- a/bench/Turbo/Decoding/main.cpp +++ b/bench/Turbo/Decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" diff --git 
a/bench/Turbo/Encoding/bench.py b/bench/Turbo/Encoding/bench.py index c9a2575..5c1db10 100755 --- a/bench/Turbo/Encoding/bench.py +++ b/bench/Turbo/Encoding/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import itertools diff --git a/bench/Turbo/Encoding/main.cpp b/bench/Turbo/Encoding/main.cpp index 185d0ea..b79df85 100644 --- a/bench/Turbo/Encoding/main.cpp +++ b/bench/Turbo/Encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateMatching/bench.py b/bench/Turbo/RateMatching/bench.py index b539f7b..a36a1ea 100755 --- a/bench/Turbo/RateMatching/bench.py +++ b/bench/Turbo/RateMatching/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateMatching/main.cpp b/bench/Turbo/RateMatching/main.cpp index ed62282..809bf14 100644 --- a/bench/Turbo/RateMatching/main.cpp +++ b/bench/Turbo/RateMatching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/Turbo/RateRecovery/bench.py b/bench/Turbo/RateRecovery/bench.py index 76107bf..2cc54c2 100755 --- a/bench/Turbo/RateRecovery/bench.py +++ b/bench/Turbo/RateRecovery/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/Turbo/RateRecovery/main.cpp b/bench/Turbo/RateRecovery/main.cpp index 42974af..61d0e78 100644 --- a/bench/Turbo/RateRecovery/main.cpp +++ b/bench/Turbo/RateRecovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16/bench.py b/bench/VectorDotProd/VecDot16/bench.py index 66928a5..4c4bacd 100755 --- a/bench/VectorDotProd/VecDot16/bench.py +++ b/bench/VectorDotProd/VecDot16/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16/main.cpp b/bench/VectorDotProd/VecDot16/main.cpp index a77e7ac..4d2179f 100644 --- a/bench/VectorDotProd/VecDot16/main.cpp +++ b/bench/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2/bench.py b/bench/VectorDotProd/VecDot16_2/bench.py index 56c75a2..18d099c 100755 --- a/bench/VectorDotProd/VecDot16_2/bench.py +++ b/bench/VectorDotProd/VecDot16_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json 
from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2/main.cpp b/bench/VectorDotProd/VecDot16_2/main.cpp index 31a2a53..356bcfb 100644 --- a/bench/VectorDotProd/VecDot16_2/main.cpp +++ b/bench/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_2_32bit/bench.py b/bench/VectorDotProd/VecDot16_2_32bit/bench.py index d016bf0..3f1b23a 100755 --- a/bench/VectorDotProd/VecDot16_2_32bit/bench.py +++ b/bench/VectorDotProd/VecDot16_2_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp b/bench/VectorDotProd/VecDot16_2_32bit/main.cpp index d942ae2..2e6377e 100644 --- a/bench/VectorDotProd/VecDot16_2_32bit/main.cpp +++ b/bench/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot16_32bit/bench.py b/bench/VectorDotProd/VecDot16_32bit/bench.py index af15ac1..2dd7bdd 100755 --- a/bench/VectorDotProd/VecDot16_32bit/bench.py +++ b/bench/VectorDotProd/VecDot16_32bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot16_32bit/main.cpp b/bench/VectorDotProd/VecDot16_32bit/main.cpp index a2fb763..0a0f27d 100644 --- a/bench/VectorDotProd/VecDot16_32bit/main.cpp +++ b/bench/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot32/bench.py b/bench/VectorDotProd/VecDot32/bench.py index 18a9f1f..13764c4 100755 --- a/bench/VectorDotProd/VecDot32/bench.py +++ b/bench/VectorDotProd/VecDot32/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot32/main.cpp b/bench/VectorDotProd/VecDot32/main.cpp index 9de48e5..5ecf2c1 100644 --- a/bench/VectorDotProd/VecDot32/main.cpp +++ b/bench/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/VectorDotProd/VecDot32_2/bench.py b/bench/VectorDotProd/VecDot32_2/bench.py index 03a00de..c249222 100755 --- a/bench/VectorDotProd/VecDot32_2/bench.py +++ b/bench/VectorDotProd/VecDot32_2/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/VectorDotProd/VecDot32_2/main.cpp b/bench/VectorDotProd/VecDot32_2/main.cpp index 
5718396..0365c30 100644 --- a/bench/VectorDotProd/VecDot32_2/main.cpp +++ b/bench/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Compression/12bit/bench.py b/bench/XRanBlockFloat/Compression/12bit/bench.py index c3c3d69..744bd01 100755 --- a/bench/XRanBlockFloat/Compression/12bit/bench.py +++ b/bench/XRanBlockFloat/Compression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/12bit/main.cpp b/bench/XRanBlockFloat/Compression/12bit/main.cpp index 7654ce9..ec36a75 100644 --- a/bench/XRanBlockFloat/Compression/12bit/main.cpp +++ b/bench/XRanBlockFloat/Compression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -34,7 +34,7 @@ int main(int argc, char **argv) { if (argc != 4) { // nprbs - The number of physical resource blocks // scale - Phase compensation term - // num_reps - The number of times to repeat the functio + // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s nprbs scale nreps\n", argv[0]); exit(EXIT_FAILURE); } diff --git a/bench/XRanBlockFloat/Compression/14bit/bench.py b/bench/XRanBlockFloat/Compression/14bit/bench.py index f3da37b..10f2e16 100755 --- a/bench/XRanBlockFloat/Compression/14bit/bench.py +++ b/bench/XRanBlockFloat/Compression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/14bit/main.cpp b/bench/XRanBlockFloat/Compression/14bit/main.cpp index adba31a..eff8698 100644 --- a/bench/XRanBlockFloat/Compression/14bit/main.cpp +++ b/bench/XRanBlockFloat/Compression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -34,7 +34,7 @@ int main(int argc, char **argv) { if (argc != 4) { // nprbs - The number of physical resource blocks // scale - Phase compensation term - // num_reps - The number of times to repeat the functio + // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s nprbs scale nreps\n", argv[0]); exit(EXIT_FAILURE); } diff --git a/bench/XRanBlockFloat/Compression/8bit/bench.py b/bench/XRanBlockFloat/Compression/8bit/bench.py index 7f8208d..3e5f2f3 100755 --- a/bench/XRanBlockFloat/Compression/8bit/bench.py +++ b/bench/XRanBlockFloat/Compression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/8bit/main.cpp b/bench/XRanBlockFloat/Compression/8bit/main.cpp index eebb335..1aa7d2c 100644 --- a/bench/XRanBlockFloat/Compression/8bit/main.cpp +++ b/bench/XRanBlockFloat/Compression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN 
Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -34,7 +34,7 @@ int main(int argc, char **argv) { if (argc != 4) { // nprbs - The number of physical resource blocks // scale - Phase compensation term - // num_reps - The number of times to repeat the functio + // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s nprbs scale nreps\n", argv[0]); exit(EXIT_FAILURE); } diff --git a/bench/XRanBlockFloat/Compression/9bit/bench.py b/bench/XRanBlockFloat/Compression/9bit/bench.py index eed6825..73391e2 100755 --- a/bench/XRanBlockFloat/Compression/9bit/bench.py +++ b/bench/XRanBlockFloat/Compression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Compression/9bit/main.cpp b/bench/XRanBlockFloat/Compression/9bit/main.cpp index 7d9865e..6a96d35 100644 --- a/bench/XRanBlockFloat/Compression/9bit/main.cpp +++ b/bench/XRanBlockFloat/Compression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -34,7 +34,7 @@ int main(int argc, char **argv) { if (argc != 4) { // nprbs - The number of physical resource blocks // scale - Phase compensation term - // num_reps - The number of times to repeat the functio + // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s nprbs scale nreps\n", argv[0]); exit(EXIT_FAILURE); } diff --git a/bench/XRanBlockFloat/Decompression/12bit/bench.py b/bench/XRanBlockFloat/Decompression/12bit/bench.py index 5f25187..f9ec6f8 100755 --- a/bench/XRanBlockFloat/Decompression/12bit/bench.py +++ b/bench/XRanBlockFloat/Decompression/12bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/12bit/main.cpp b/bench/XRanBlockFloat/Decompression/12bit/main.cpp index bd864a4..9816ac1 100644 --- a/bench/XRanBlockFloat/Decompression/12bit/main.cpp +++ b/bench/XRanBlockFloat/Decompression/12bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/14bit/bench.py b/bench/XRanBlockFloat/Decompression/14bit/bench.py index bb29e7e..1f08f50 100755 --- a/bench/XRanBlockFloat/Decompression/14bit/bench.py +++ b/bench/XRanBlockFloat/Decompression/14bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/14bit/main.cpp b/bench/XRanBlockFloat/Decompression/14bit/main.cpp index 2488e0a..52226a5 100644 --- a/bench/XRanBlockFloat/Decompression/14bit/main.cpp +++ b/bench/XRanBlockFloat/Decompression/14bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or 
its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/8bit/bench.py b/bench/XRanBlockFloat/Decompression/8bit/bench.py index 7c7dc3b..f20eb2b 100755 --- a/bench/XRanBlockFloat/Decompression/8bit/bench.py +++ b/bench/XRanBlockFloat/Decompression/8bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/8bit/main.cpp b/bench/XRanBlockFloat/Decompression/8bit/main.cpp index 1e79256..7734d12 100644 --- a/bench/XRanBlockFloat/Decompression/8bit/main.cpp +++ b/bench/XRanBlockFloat/Decompression/8bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/XRanBlockFloat/Decompression/9bit/bench.py b/bench/XRanBlockFloat/Decompression/9bit/bench.py index 2a35fd3..5cf57a7 100755 --- a/bench/XRanBlockFloat/Decompression/9bit/bench.py +++ b/bench/XRanBlockFloat/Decompression/9bit/bench.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json from pathlib import Path diff --git a/bench/XRanBlockFloat/Decompression/9bit/main.cpp b/bench/XRanBlockFloat/Decompression/9bit/main.cpp index 0ce037d..1e868ff 100644 --- a/bench/XRanBlockFloat/Decompression/9bit/main.cpp +++ b/bench/XRanBlockFloat/Decompression/9bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/bench/benchmarker.py b/bench/benchmarker.py index 17a67ab..d3c2d6a 100755 --- a/bench/benchmarker.py +++ b/bench/benchmarker.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates # This program is for benchmarking the performance of armral functions. diff --git a/bench/benchmarker_utils.py b/bench/benchmarker_utils.py index 06a701b..c369eb2 100755 --- a/bench/benchmarker_utils.py +++ b/bench/benchmarker_utils.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import collections import os import subprocess diff --git a/bench/default_runner.py b/bench/default_runner.py index bec13d0..e5cb3ca 100755 --- a/bench/default_runner.py +++ b/bench/default_runner.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates import json import argparse diff --git a/docs/doxywrapper/arm_footer.html b/docs/doxywrapper/arm_footer.html index 1851bd0..93fae82 100644 --- a/docs/doxywrapper/arm_footer.html +++ b/docs/doxywrapper/arm_footer.html @@ -4,14 +4,14 @@ diff --git a/docs/doxywrapper/proprietary_notice.html b/docs/doxywrapper/proprietary_notice.html index d91418d..931d102 100644 --- a/docs/doxywrapper/proprietary_notice.html +++ b/docs/doxywrapper/proprietary_notice.html @@ -47,7 +47,7 @@ document may be the trademarks of their respective owners. 
Please follow Arm's trademark usage guidelines at https://www.arm.com/company/policies/trademarks.

-Copyright © 2020-2023 Arm Limited (or its affiliates). All rights reserved.
+Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved.
Arm Limited. Company 02557590 registered in England.
110 Fulbourn Road, Cambridge, England CB1 9NJ.
(LES-PRE-20349)

diff --git a/docs/examples.md b/docs/examples.md index 04db467..ebfeff5 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -17,9 +17,9 @@ Acceleration Library (ArmRAL). To build the library, use: - tar zxvf ral-armral-23.10.tar.gz - mkdir ral-armral-23.10/build - cd ral-armral-23.10/build + tar zxvf ral-armral-24.01.tar.gz + mkdir ral-armral-24.01/build + cd ral-armral-24.01/build cmake .. make -j diff --git a/docs/frontmatter.md b/docs/frontmatter.md index aa4cfdd..c5fd9bf 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -1,6 +1,6 @@ # Arm RAN Acceleration Library (ArmRAL) Reference Guide -Copyright © 2020-2023 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. # About this book @@ -39,7 +39,7 @@ supplier and give: If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2310_00_en. +* The number 102249_2401_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. @@ -95,7 +95,7 @@ rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm's trademark usage guidelines at https://www.arm.com/company/policies/trademarks. -Copyright © 2020-2023 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. Arm Limited. Company 02557590 registered in England. @@ -148,3 +148,4 @@ Issue | Date | Confidentiality | Change 2304-00 | 21 April 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.04 2307-00 | 07 July 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.07 2310-00 | 06 October 2023 | Non-Confidential | Update for Arm RAN Acceleration Library v23.10 +2401-00 | 19 January 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.01 diff --git a/examples/block_float_9b_example.c b/examples/block_float_9b_example.c index 8cab3fa..8abe390 100644 --- a/examples/block_float_9b_example.c +++ b/examples/block_float_9b_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/fft_cf32_example.c b/examples/fft_cf32_example.c index 8a29db7..690d876 100644 --- a/examples/fft_cf32_example.c +++ b/examples/fft_cf32_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/modulation_example.c b/examples/modulation_example.c index 791ef4c..3ee95d6 100644 --- a/examples/modulation_example.c +++ b/examples/modulation_example.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/examples/polar_example.cpp b/examples/polar_example.cpp index ca0c50e..5648f30 100644 --- a/examples/polar_example.cpp +++ b/examples/polar_example.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/include/.clang-tidy b/include/.clang-tidy index 3d1092e..d8f91b4 100644 --- 
a/include/.clang-tidy +++ b/include/.clang-tidy @@ -2,7 +2,6 @@ Checks: '' WarningsAsErrors: '' HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false FormatStyle: file InheritParentConfig: true CheckOptions: diff --git a/include/armral.h b/include/armral.h index 7e77bf5..2f247eb 100644 --- a/include/armral.h +++ b/include/armral.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -1626,23 +1626,35 @@ armral_cmplx_mat_inverse_batch_f32_pa(uint32_t num_mats, uint32_t size, */ /** * Computes the regularized pseudo-inverse of a single matrix. The `N-by-M` - * regularized pseudo-inverse `C` of an `M-by-N` matrix `A` with `M <= N` is - * defined as: + * regularized pseudo-inverse `C` of an `M-by-N` matrix `A` is defined as: * *
  *   C = A^H * (A * A^H + λ * I)^-1
  * 
* + * for `M <= N`, and is defined as: + * + *
+ *   C = (A^H * A + λ * I)^-1 * A^H
+ * 
+ * + * for `M > N`. + * * This function performs numerical matrix inversion using the Schur complement - * to compute the regularized pseudo-inverse of `A` directly from this - * expression. + * to compute the regularized pseudo-inverse of `A` directly from the + * appropriate expression. * * \warning This method is numerically unstable for matrices that are not very * well conditioned. * * The input matrix `p_src` and output matrix `p_dst` are stored contiguously - * in memory, in row-major order. The number of rows `m` in the input matrix - * must be 2, 3, 4, 8 or 16 and `m <= n`. + * in memory, in row-major order. + * + * \note + * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * 8 or 16. + * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * 4, 8 or 16. * * @param[in] m The number of rows in input matrix `A`. * @param[in] n The number of columns in input matrix `A`. @@ -1660,29 +1672,45 @@ armral_cmplx_pseudo_inverse_direct_f32(uint16_t m, uint16_t n, float32_t lambda, * Non-allocating variant of \link armral_cmplx_pseudo_inverse_direct_f32 * \endlink. * - * This function computes the regularized pseudo-inverse of a single matrix. - * The `N-by-M` regularized pseudo-inverse `C` of an `M-by-N` matrix `A` - * with `M <= N` is defined as: + * Computes the regularized pseudo-inverse of a single matrix. The `N-by-M` + * regularized pseudo-inverse `C` of an `M-by-N` matrix `A` is defined as: * *
  *   C = A^H * (A * A^H + λ * I)^-1
  * 
* + * for `M <= N`, and is defined as: + * + *
+ *   C = (A^H * A + λ * I)^-1 * A^H
+ * 
+ * + * for `M > N`. + * * This function performs numerical matrix inversion using the Schur complement - * to compute the regularized pseudo-inverse of `A` directly from this - * expression. + * to compute the regularized pseudo-inverse of `A` directly from the + * appropriate expression. * * \warning This method is numerically unstable for matrices that are not very * well conditioned. * * The input matrix `p_src` and output matrix `p_dst` are stored contiguously - * in memory, in row-major order. The number of rows `m` in the input matrix - * must be 2, 3, 4, 8 or 16 and `m <= n`. + * in memory, in row-major order. + * + * \note + * - If `m <= n` the number of rows `m` in the input matrix must be 2, 3, 4, + * 8 or 16. + * - If `m > n` the number of columns `n` in the input matrix must be 2, 3, + * 4, 8 or 16. * * This function takes a pre-allocated buffer (`buffer`) to use internally. * This variant will not call any system memory allocators. * - * The buffer must be at least `m * m * sizeof(armral_cmplx_f32_t) + 3` bytes. + * \note + * - If `m <= n` the buffer must be at least + * `m * m * sizeof(armral_cmplx_f32_t) + 3` bytes. + * - If `m > n` the buffer must be at least + * `n * n * sizeof(armral_cmplx_f32_t) + 3` bytes. * * @param[in] m The number of rows in input matrix `A`. * @param[in] n The number of columns in input matrix `A`. @@ -2557,7 +2585,7 @@ armral_status armral_crc16_le(uint32_t size, const uint64_t *input, * * @param[in] size The number of bytes of the given buffer. * @param[in] input Points to the input byte sequence. - * @param[out] crc16 The computed CRC on 16 bit. + * @param[out] crc16 The computed 16-bit CRC result. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_crc16_be(uint32_t size, const uint64_t *input, @@ -2581,7 +2609,7 @@ armral_status armral_crc11_le(uint32_t size, const uint64_t *input, * * @param[in] size The number of bytes of the given buffer. * @param[in] input Points to the input byte sequence. - * @param[out] crc11 The computed CRC on 11 bit. + * @param[out] crc11 The computed 11-bit CRC result. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_crc11_be(uint32_t size, const uint64_t *input, @@ -2605,7 +2633,7 @@ armral_status armral_crc6_le(uint32_t size, const uint64_t *input, * * @param[in] size The number of bytes of the given buffer. * @param[in] input Points to the input byte sequence. - * @param[out] crc6 The computed CRC on 6 bit. + * @param[out] crc6 The computed 6-bit CRC result. * @return An `armral_status` value that indicates success or failure. */ armral_status armral_crc6_be(uint32_t size, const uint64_t *input, @@ -2674,9 +2702,12 @@ armral_status armral_crc6_be(uint32_t size, const uint64_t *input, * @param[in] n_pc The number of parity bits in the encoded message. * @param[in] n_pc_wm The number of row-weight-selected parity bits in the * encoded message. Must be either zero or one. - * @param[out] frozen The output `frozen` mask, length `n` bytes. Elements - * corresponding to `frozen` bits are set to all ones, - * everything else set to zero. + * @param[out] frozen The output `frozen` mask, length `n` bytes. As described + * by `armral_polar_frozen_bit_type`, elements + * corresponding to `frozen` bits are set to `0xFF`, + * elements corresponding to parity bits are set to `0x01`, + * and elements corresponding to information bits are set + * to `0x00`. * @return An `armral_status` value that indicates success or failure. 
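
[Editorial note] The rewritten parameter description above pins down the byte values an application can rely on: `0xFF` for frozen positions, `0x01` for the row-weight-selected parity positions, and `0x00` for information positions. A minimal sketch of how a caller might classify the mask produced by armral_polar_frozen_mask follows; the helper name and counting loop are purely illustrative and are not part of the library or of this patch.

  // Illustrative only: tally the three position types in a frozen mask of
  // length n bytes, using the byte values documented above.
  static void count_frozen_mask_sketch(const uint8_t *frozen, uint32_t n,
                                       uint32_t *n_info, uint32_t *n_parity,
                                       uint32_t *n_frozen) {
    *n_info = *n_parity = *n_frozen = 0;
    for (uint32_t i = 0; i < n; ++i) {
      if (frozen[i] == 0x00) {
        ++*n_info;    // information bit position
      } else if (frozen[i] == 0x01) {
        ++*n_parity;  // parity bit position
      } else {
        ++*n_frozen;  // 0xFF: frozen bit position
      }
    }
  }
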
*/ armral_status armral_polar_frozen_mask(uint32_t n, uint32_t e, uint32_t k, @@ -2782,8 +2813,8 @@ armral_status armral_polar_decode_block(uint32_t n, const uint8_t *frozen, /** * Matches the rate of the Polar encoded code block to the rate of the channel - * using sub-block interleaving, bit selection, and channel interleaving based on - * Downlink or Uplink direction. This is as described in the 3GPP Technical + * using sub-block interleaving, bit selection, and channel interleaving based + * on Downlink or Uplink direction. This is as described in the 3GPP Technical * Specification (TS) 38.212 section 5.4.1. * * The code rate of the code block is defined by the ratio of the rate-matched @@ -2795,7 +2826,8 @@ armral_status armral_polar_decode_block(uint32_t n, const uint8_t *frozen, * @param[in] n The number of bits in the code block. * @param[in] e The number of bits in the rate-matched message. * @param[in] k The number of information bits in the code block. - * @param[in] i_bil Flag to enable/disable the interleaving of coded bits. + * @param[in] i_bil Flag to enable/disable the interleaving of coded + * bits. * @param[in] p_d_seq_in Points to `n` bits representing the Polar encoded * message. * @param[out] p_f_seq_out Points to `e` bits representing the rate-matched @@ -2810,10 +2842,10 @@ armral_status armral_polar_rate_matching(uint32_t n, uint32_t e, uint32_t k, /** * Non-allocating variant of \link armral_polar_rate_matching \endlink. * - * This function matches the rate of the Polar encoded code block to the rate - * of the channel using sub-block interleaving, bit selection, and channel - * interleaving. This is as described in the 3GPP Technical Specification (TS) - * 38.212 section 5.4.1. + * Matches the rate of the Polar encoded code block to the rate of the channel + * using sub-block interleaving, bit selection, and channel interleaving based + * on Downlink or Uplink direction. This is as described in the 3GPP Technical + * Specification (TS) 38.212 section 5.4.1. * * The code rate of the code block is defined by the ratio of the rate-matched * length `e` to the number of information bits in the message `k`. It is @@ -2832,7 +2864,8 @@ armral_status armral_polar_rate_matching(uint32_t n, uint32_t e, uint32_t k, * @param[in] n The number of bits in the code block. * @param[in] e The number of bits in the rate-matched message. * @param[in] k The number of information bits in the code block. - * @param[in] i_bil Flag to enable/disable the interleaving of coded bits. + * @param[in] i_bil Flag to enable/disable the interleaving of coded + * bits. * @param[in] p_d_seq_in Points to `n` bits representing the Polar encoded * message. * @param[out] p_f_seq_out Points to `e` bits representing the rate-matched @@ -3541,7 +3574,7 @@ uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, * section 5.2.2, filler bits insertion is needed to * ensure that the code block segments have a valid * length and are a multiple of the lifting size. - * @param[in] k codeblock size, the number of bits to encode as + * @param[in] k Codeblock size, the number of bits to encode as * per section 5.3.2 of TS 38.212. * @param[in] rv Redundancy version used in rate matching. Must be * in the set `{0, 1, 2, 3}`. The effect of choosing @@ -3603,7 +3636,7 @@ armral_status armral_ldpc_rate_matching(armral_ldpc_graph_t bg, uint32_t z, * section 5.2.2, filler bits insertion is needed to * ensure that the code block segments have a valid * length and are a multiple of the lifting size. 
- * @param[in] k codeblock size, the number of bits to encode as + * @param[in] k Codeblock size, the number of bits to encode as * per section 5.3.2 of TS 38.212. * @param[in] rv Redundancy version used in rate matching. Must be * in the set `{0, 1, 2, 3}`. The effect of choosing @@ -3666,7 +3699,7 @@ armral_status armral_ldpc_rate_matching_noalloc( * section 5.2.2, filler bits insertion is needed to * ensure that the code block segments have a valid * length and are a multiple of the lifting size. - * @param[in] k codeblock size, the number of bits to encode as + * @param[in] k Codeblock size, the number of bits to encode as * per section 5.3.2 of TS 38.212. * @param[in] rv Redundancy version used in rate matching. Must be * in the set `{0, 1, 2, 3}`. The effect of choosing @@ -3737,7 +3770,7 @@ armral_status armral_ldpc_rate_recovery(armral_ldpc_graph_t bg, uint32_t z, * section 5.2.2, filler bits insertion is needed to * ensure that the code block segments have a valid * length and are a multiple of the lifting size. - * @param[in] k codeblock size, the number of bits to encode as + * @param[in] k Codeblock size, the number of bits to encode as * per section 5.3.2 of TS 38.212. * @param[in] rv Redundancy version used in rate matching. Must be * in the set `{0, 1, 2, 3}`. The effect of choosing diff --git a/license_terms/BSD-3-Clause.txt b/license_terms/BSD-3-Clause.txt index b225ef5..10ce6d4 100644 --- a/license_terms/BSD-3-Clause.txt +++ b/license_terms/BSD-3-Clause.txt @@ -1,4 +1,4 @@ -Copyright 2020-2023 Arm Limited and/or its affiliates +Copyright 2020-2024 Arm Limited and/or its affiliates Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/simulation/README.md b/simulation/README.md index 8540d80..77829a4 100644 --- a/simulation/README.md +++ b/simulation/README.md @@ -260,7 +260,7 @@ The JSON record contains the following fields: "Eb/N0": , "snr": , "ulp": , - "len_filler_bits": + "len_filler_bits": , "bler": , "ber": } diff --git a/simulation/awgn/awgn.cpp b/simulation/awgn/awgn.cpp index 8e41e48..6ae035c 100644 --- a/simulation/awgn/awgn.cpp +++ b/simulation/awgn/awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "awgn.h" #include "rng.hpp" diff --git a/simulation/awgn/awgn.h b/simulation/awgn/awgn.h index a49a14b..ad8c8fe 100644 --- a/simulation/awgn/awgn.h +++ b/simulation/awgn/awgn.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/simulation/capacity/capacity.py b/simulation/capacity/capacity.py index b7801b7..4b331ad 100755 --- a/simulation/capacity/capacity.py +++ b/simulation/capacity/capacity.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from math import sqrt, exp, pi, log diff --git a/simulation/convolutional_awgn/CMakeLists.txt b/simulation/convolutional_awgn/CMakeLists.txt index c074e63..14faf79 100644 --- a/simulation/convolutional_awgn/CMakeLists.txt +++ b/simulation/convolutional_awgn/CMakeLists.txt @@ -30,5 +30,6 @@ if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) # a set of valid inputs. 
We do not check the validity of the output. # We also only run this if we are not using a test running wrapper. add_test(NAME convolutional_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/convolutional_awgn -k 8 -m 0 -u 128) + set_tests_properties(convolutional_awgn PROPERTIES TIMEOUT 3000) add_dependencies(check convolutional_awgn) endif() diff --git a/simulation/convolutional_awgn/convolutional_awgn.cpp b/simulation/convolutional_awgn/convolutional_awgn.cpp index ee13270..294b5b3 100644 --- a/simulation/convolutional_awgn/convolutional_awgn.cpp +++ b/simulation/convolutional_awgn/convolutional_awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "awgn.h" diff --git a/simulation/convolutional_awgn/convolutional_error_rate.py b/simulation/convolutional_awgn/convolutional_error_rate.py index e3f9ba5..1bd7e71 100755 --- a/simulation/convolutional_awgn/convolutional_error_rate.py +++ b/simulation/convolutional_awgn/convolutional_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/include/simulation_common.hpp b/simulation/include/simulation_common.hpp index 927ddda..ecced32 100644 --- a/simulation/include/simulation_common.hpp +++ b/simulation/include/simulation_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/simulation/include/simulation_common.py b/simulation/include/simulation_common.py index 036990b..f062f40 100755 --- a/simulation/include/simulation_common.py +++ b/simulation/include/simulation_common.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from dataclasses import dataclass from datetime import datetime diff --git a/simulation/ldpc_awgn/CMakeLists.txt b/simulation/ldpc_awgn/CMakeLists.txt index 637ab3c..bfba143 100644 --- a/simulation/ldpc_awgn/CMakeLists.txt +++ b/simulation/ldpc_awgn/CMakeLists.txt @@ -30,5 +30,6 @@ if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) # a set of valid inputs. We do not check the validity of the output. # We also only run this if we are not using a test running wrapper. 
add_test(NAME ldpc_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ldpc_awgn -z 3 -b 1 -m 0 -r 0 -u 128) + set_tests_properties(ldpc_awgn PROPERTIES TIMEOUT 3000) add_dependencies(check ldpc_awgn) endif() diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp index 5932488..9df77ae 100644 --- a/simulation/ldpc_awgn/ldpc_awgn.cpp +++ b/simulation/ldpc_awgn/ldpc_awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "awgn.h" @@ -102,9 +102,9 @@ void usage(const char *exe_name) { << " the symbol amplitudes are multiplied by a\n" << " scaling factor of 0x1p15/.\n" << " Default value is 128.\n" - << " Filler bits length used to simulate case\n" - << " where transport block length is not multiple\n" - << " of Lifting size .\n" + << " Number of filler bits to use when simulating\n" + << " cases where the transport block length is\n" + << " not a multiple of the lifting size.\n" << " Default length is 0.\n" << std::endl; } @@ -279,7 +279,7 @@ struct sim_result { s << "{\"n\": " << n << ", \"bg\": " << bg << ", \"mod_type\": \"" << mod_type << "\", \"rv\": " << rv << ", \"Eb/N0\": " << ebn0 << ", \"snr\": " << snr << ", \"ulp\": " << ulp - << ", \"len_filler_bits\": " << len_filler_bits << ",\"bler\": " << bler + << ", \"len_filler_bits\": " << len_filler_bits << ",\"bler\": " << bler << ", \"ber\": " << ber << "}"; return std::move(s).str(); } diff --git a/simulation/ldpc_awgn/ldpc_error_rate.py b/simulation/ldpc_awgn/ldpc_error_rate.py index ac7a476..32e75e3 100755 --- a/simulation/ldpc_awgn/ldpc_error_rate.py +++ b/simulation/ldpc_awgn/ldpc_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/modulation_awgn/CMakeLists.txt b/simulation/modulation_awgn/CMakeLists.txt index eecf402..c30886a 100644 --- a/simulation/modulation_awgn/CMakeLists.txt +++ b/simulation/modulation_awgn/CMakeLists.txt @@ -30,5 +30,6 @@ if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) # a set of valid inputs. We do not check the validity of the output. # We also only run this if we are not using a test running wrapper. 
add_test(NAME modulation_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/modulation_awgn -k 32 -m 0 -u 128) + set_tests_properties(modulation_awgn PROPERTIES TIMEOUT 3000) add_dependencies(check modulation_awgn) endif() diff --git a/simulation/modulation_awgn/modulation_awgn.cpp b/simulation/modulation_awgn/modulation_awgn.cpp index 32ea7b9..426c90e 100644 --- a/simulation/modulation_awgn/modulation_awgn.cpp +++ b/simulation/modulation_awgn/modulation_awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "awgn.h" #include "bit_utils.hpp" diff --git a/simulation/modulation_awgn/modulation_error_rate.py b/simulation/modulation_awgn/modulation_error_rate.py index 53628cb..c5b72ea 100755 --- a/simulation/modulation_awgn/modulation_error_rate.py +++ b/simulation/modulation_awgn/modulation_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser import pandas as pd diff --git a/simulation/polar_awgn/CMakeLists.txt b/simulation/polar_awgn/CMakeLists.txt index 52ef1cc..249b4bc 100644 --- a/simulation/polar_awgn/CMakeLists.txt +++ b/simulation/polar_awgn/CMakeLists.txt @@ -30,5 +30,6 @@ if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) # a set of valid inputs. We do not check the validity of the output. # We also only run this if we are not using a test running wrapper. add_test(NAME polar_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/polar_awgn -k 32 -e 32 -l 1 -m 0 -i 0 -u 128) + set_tests_properties(polar_awgn PROPERTIES TIMEOUT 3000) add_dependencies(check polar_awgn) endif() diff --git a/simulation/polar_awgn/polar_awgn.cpp b/simulation/polar_awgn/polar_awgn.cpp index 704963d..76241f0 100644 --- a/simulation/polar_awgn/polar_awgn.cpp +++ b/simulation/polar_awgn/polar_awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "awgn.h" @@ -105,7 +105,6 @@ struct polar_example_data { uint32_t len_out; uint32_t bits_per_mod_symbol; uint32_t num_mod_symbols; - uint32_t num_mod_symbols_matched; uint8_t *data_in; uint8_t *frozen_mask; uint8_t *data_interleave; diff --git a/simulation/polar_awgn/polar_error_rate.py b/simulation/polar_awgn/polar_error_rate.py index fd00161..5cd4234 100755 --- a/simulation/polar_awgn/polar_error_rate.py +++ b/simulation/polar_awgn/polar_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/turbo_awgn/CMakeLists.txt b/simulation/turbo_awgn/CMakeLists.txt index afdc6a4..0f6389a 100644 --- a/simulation/turbo_awgn/CMakeLists.txt +++ b/simulation/turbo_awgn/CMakeLists.txt @@ -30,5 +30,6 @@ if(BUILD_TESTING AND NOT DEFINED ARMRAL_TEST_RUNNER) # a set of valid inputs. We do not check the validity of the output. # We also only run this if we are not using a test running wrapper. 
add_test(NAME turbo_awgn COMMAND ${CMAKE_CURRENT_BINARY_DIR}/turbo_awgn -k 40 -m 0 -i 1 -e 60) + set_tests_properties(turbo_awgn PROPERTIES TIMEOUT 3000) add_dependencies(check turbo_awgn) endif() diff --git a/simulation/turbo_awgn/turbo_awgn.cpp b/simulation/turbo_awgn/turbo_awgn.cpp index d751f22..29c8bdb 100644 --- a/simulation/turbo_awgn/turbo_awgn.cpp +++ b/simulation/turbo_awgn/turbo_awgn.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "awgn.h" diff --git a/simulation/turbo_awgn/turbo_error_rate.py b/simulation/turbo_awgn/turbo_error_rate.py index 3351129..6725cdf 100755 --- a/simulation/turbo_awgn/turbo_error_rate.py +++ b/simulation/turbo_awgn/turbo_error_rate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# Copyright 2020-2023 Arm Limited and/or its affiliates +# Copyright 2020-2024 Arm Limited and/or its affiliates from argparse import ArgumentParser from dataclasses import dataclass diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp index 14992ba..5c889e1 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp index 6246a8a..fccc80a 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp index c3d0d68..cb47a3e 100644 --- a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp +++ b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ namespace armral::cmplx_herm_mat_inv { diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp index 45a05c7..29d1683 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_aah_f32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c index 81907e8..ddad389 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_ahb_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff 
--git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c index 1179efd..4712548 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c index 9ebad6c..f1b3341 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c index 7a7863e..fdfe709 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_mult_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c index e0eeffe..631c0b1 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c index 13317c0..1f2ba56 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c index 998abdf..5f0c082 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c @@ -1,17 +1,12 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" #include -typedef struct { - int32_t re; - int32_t im; -} cmplx_int32_t; - static int16x4_t vld1s_s16(const armral_cmplx_int16_t *p) { // there is no intrinsic for only loading 32-bits into an ACLE vector. 
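// (Editorial sketch, not part of the patch.) The context line above notes that
// ACLE provides no intrinsic that loads only 32 bits into a vector. One common
// workaround, shown with a hypothetical helper name and assuming <string.h>
// and <arm_neon.h> are included, is a plain 32-bit scalar load followed by a
// duplicate-and-reinterpret:
static inline int16x4_t load_one_cs16_sketch(const armral_cmplx_int16_t *p) {
  int32_t bits;
  memcpy(&bits, p, sizeof bits);                 // 32-bit scalar load
  return vreinterpret_s16_s32(vdup_n_s32(bits)); // broadcast, view as 4 x int16
}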
int16x4_t ret; diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c index 18519b7..6717b10 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h index 18c807e..b0f9c8a 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c index f5f7761..56bd0d8 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_4sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h index 7bb8a6f..f2963b8 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c index b7ff8cc..edadced 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_6sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h index cd4d641..52bcf9d 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_convert.h b/src/BasicMathFun/MatrixMult/arm_solve_convert.h index 3352d75..acc8842 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_convert.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_convert.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_f32.c b/src/BasicMathFun/MatrixMult/arm_solve_f32.c index 3afd242..75dfecd 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "arm_solve_1sc.h" #include "arm_solve_4sc.h" diff --git 
a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp index 657c463..999db43 100644 --- a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp +++ b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp @@ -1,29 +1,53 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" #include "../MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp" +#include "cmplx_mat_pseudo_inverse.hpp" #include namespace { +template +void left_pseudo_inverse(uint16_t m, const float32_t lambda, + const armral_cmplx_f32_t *__restrict p_src, + armral_cmplx_f32_t *p_dst, Allocator &allocator) { + + // Compute C = A^H * A + // We can use p_dst as an intermediate N-by-N array since it has size N-by-M, + // and N < M + auto *mat_aha = p_dst; + armral_cmplx_mat_mult_ahb_f32(m, n, n, p_src, p_src, mat_aha); + + // Compute C += lambda * I + armral::cmplx_mat_pseudo_inv::add_lambda(lambda, p_dst); + + // Compute B = C^(-1) + auto mat_inv = allocate_uninitialized(allocator, n * n); + armral::cmplx_herm_mat_inv::invert_hermitian_matrix(mat_aha, + mat_inv.get()); + + // Compute B * A^H + armral::cmplx_mat_pseudo_inv::mat_mult_bah_f32(m, n, p_src, mat_inv.get(), + p_dst); +} + template -void pseudo_inverse(uint16_t n, const float32_t lambda, - const armral_cmplx_f32_t *__restrict p_src, - armral_cmplx_f32_t *p_dst, Allocator &allocator) { +void right_pseudo_inverse(uint16_t n, const float32_t lambda, + const armral_cmplx_f32_t *__restrict p_src, + armral_cmplx_f32_t *p_dst, Allocator &allocator) { // Compute C = A * A^H // We can use p_dst as an intermediate M-by-M array since it has size N-by-M, // and N >= M auto *mat_aah = p_dst; armral_cmplx_mat_mult_aah_f32(m, n, p_src, mat_aah); - for (uint16_t i = 0; i < m; i++) { - mat_aah[i * (m + 1)].re += lambda; - } + // Compute C += lambda * I + armral::cmplx_mat_pseudo_inv::add_lambda(lambda, mat_aah); // Compute B = C^(-1) auto mat_inv = allocate_uninitialized(allocator, m * m); @@ -40,35 +64,63 @@ cmplx_pseudo_inverse_direct(uint16_t m, uint16_t n, const float32_t lambda, const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t *p_dst, Allocator &allocator) { - // The number of rows must be less than or equal to the number of columns to - // allow A * A^H to be invertible - if (m > n) { - return ARMRAL_ARGUMENT_ERROR; - } - // This routine uses the Hermitian matrix inversion routine defined in the // library (armral_cmplx_hermitian_max_inverse_f32) which is only valid for // particular matrix sizes. This places a restriction on the number of rows // that the input matrix A can have here. 
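// (Editorial sketch, not part of the patch.) The dispatch added below selects
// between the two expressions documented in armral.h: for m <= n the m-by-m
// Gram matrix A * A^H is regularized and inverted (right pseudo-inverse), and
// for m > n the n-by-n Gram matrix A^H * A is used instead (left
// pseudo-inverse), so the dimension restricted to {2, 3, 4, 8, 16} is always
// the smaller one. A caller's-eye view, assuming the parameter order
// (m, n, lambda, p_src, p_dst) from the header hunk above; the matrix sizes,
// contents and lambda value are illustrative only:
void pseudo_inverse_usage_sketch() {
  armral_cmplx_f32_t a[2 * 4] = {}; // 2-by-4 input, row-major
  armral_cmplx_f32_t a_pinv[4 * 2]; // 4-by-2 output: right pseudo-inverse
  armral_cmplx_f32_t b[4 * 2] = {}; // 4-by-2 input, row-major
  armral_cmplx_f32_t b_pinv[2 * 4]; // 2-by-4 output: left pseudo-inverse
  float32_t lambda = 0.01F;
  armral_cmplx_pseudo_inverse_direct_f32(2, 4, lambda, a, a_pinv); // m <= n path
  armral_cmplx_pseudo_inverse_direct_f32(4, 2, lambda, b, b_pinv); // m > n path (new)
}
// Per the updated documentation, the _noalloc variant additionally needs a
// scratch buffer of at least m*m (when m <= n) or n*n (when m > n) times
// sizeof(armral_cmplx_f32_t), plus 3 bytes.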
+ + // If the number of rows in the input matrix is larger than the number of + // columns then use the left pseudo-inverse + if (m > n) { + switch (n) { + case 2: { + left_pseudo_inverse<2>(m, lambda, p_src, p_dst, allocator); + break; + } + case 3: { + left_pseudo_inverse<3>(m, lambda, p_src, p_dst, allocator); + break; + } + case 4: { + left_pseudo_inverse<4>(m, lambda, p_src, p_dst, allocator); + break; + } + case 8: { + left_pseudo_inverse<8>(m, lambda, p_src, p_dst, allocator); + break; + } + case 16: { + left_pseudo_inverse<16>(m, lambda, p_src, p_dst, allocator); + break; + } + default: + return ARMRAL_ARGUMENT_ERROR; + } + + return ARMRAL_SUCCESS; + } + + // If the number of rows in the input matrix is less than or equal to the number + // of columns then use the right pseudo-inverse switch (m) { case 2: { - pseudo_inverse<2>(n, lambda, p_src, p_dst, allocator); + right_pseudo_inverse<2>(n, lambda, p_src, p_dst, allocator); break; } case 3: { - pseudo_inverse<3>(n, lambda, p_src, p_dst, allocator); + right_pseudo_inverse<3>(n, lambda, p_src, p_dst, allocator); break; } case 4: { - pseudo_inverse<4>(n, lambda, p_src, p_dst, allocator); + right_pseudo_inverse<4>(n, lambda, p_src, p_dst, allocator); break; } case 8: { - pseudo_inverse<8>(n, lambda, p_src, p_dst, allocator); + right_pseudo_inverse<8>(n, lambda, p_src, p_dst, allocator); break; } case 16: { - pseudo_inverse<16>(n, lambda, p_src, p_dst, allocator); + right_pseudo_inverse<16>(n, lambda, p_src, p_dst, allocator); break; } default: diff --git a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp new file mode 100644 index 0000000..04525b9 --- /dev/null +++ b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp @@ -0,0 +1,69 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ + +namespace armral::cmplx_mat_pseudo_inv { + +void mat_mult_bah_f32(uint16_t m, uint16_t n, + const armral_cmplx_f32_t *__restrict p_src_a, + const armral_cmplx_f32_t *__restrict p_src_b, + armral_cmplx_f32_t *p_dst) { + // For input matrices A and B, computes B * A^H + for (uint16_t i = 0; i < n; i++) { + for (uint16_t j = 0; j < m; j++) { + float32_t re = 0.0; + float32_t im = 0.0; + float32x4x2_t p_out = {{vdupq_n_f32(0.F), vdupq_n_f32(0.F)}}; + + uint16_t k = 0; + for (; k + 3 < n; k += 4) { + uint32_t b_idx = i * n + k; + uint32_t ah_idx = j * n + k; + + float32x4_t p_in_b[] = { + vld1q_f32((const float32_t *)&p_src_b[b_idx]), + vld1q_f32((const float32_t *)&p_src_b[b_idx + 2])}; + float32x4_t p_in_ah[] = { + vld1q_f32((const float32_t *)&p_src_a[ah_idx]), + vld1q_f32((const float32_t *)&p_src_a[ah_idx + 2])}; + + // c.re = a.re * ah.re + a.im * ah.im + p_out.val[0] = vfmaq_f32(p_out.val[0], p_in_b[0], p_in_ah[0]); + p_out.val[0] = vfmaq_f32(p_out.val[0], p_in_b[1], p_in_ah[1]); + + // c.im = a.im * ah.re - a.re * ah.im + p_in_ah[0] = vrev64q_f32(p_in_ah[0]); + p_in_ah[1] = vrev64q_f32(p_in_ah[1]); + p_out.val[1] = vfmsq_f32(p_out.val[1], p_in_b[0], p_in_ah[0]); + p_out.val[1] = vfmsq_f32(p_out.val[1], p_in_b[1], p_in_ah[1]); + } + re = vaddvq_f32(p_out.val[0]); + p_out.val[1] = + vreinterpretq_f32_f64(vnegq_f64(vreinterpretq_f64_f32(p_out.val[1]))); + im = vaddvq_f32(p_out.val[1]); + + if (n % 4 != 0) { + for (; k < n; k++) { + uint32_t b_idx = i * n + k; + uint32_t ah_idx = j * n + k; + re += p_src_b[b_idx].re * p_src_a[ah_idx].re + + p_src_b[b_idx].im * p_src_a[ah_idx].im; + im += p_src_b[b_idx].im * 
p_src_a[ah_idx].re - + p_src_b[b_idx].re * p_src_a[ah_idx].im; + } + } + p_dst[i * m + j] = armral_cmplx_f32_t{re, im}; + } + } +} + +template +void add_lambda(float32_t lambda, armral_cmplx_f32_t *p_dst) { + // Adds lambda to the diagonals of a dim-by-dim matrix + for (uint16_t i = 0; i < dim; i++) { + p_dst[i * (dim + 1)].re += lambda; + } +} + +} // namespace armral::cmplx_mat_pseudo_inv diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c index ba61e98..9813462 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c index 129fed3..4549d4a 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE @@ -50,12 +50,12 @@ armral_status armral_cmplx_vecdot_f32_2(int32_t n, svfloat32_t vec_b_imag = svld1_f32(tail_pg, p_src_b_im); /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */ - acc_real = svmla_f32_x(tail_pg, acc_real, vec_a_real, vec_b_real); - acc_real = svmls_f32_x(tail_pg, acc_real, vec_a_imag, vec_b_imag); + acc_real = svmla_f32_m(tail_pg, acc_real, vec_a_real, vec_b_real); + acc_real = svmls_f32_m(tail_pg, acc_real, vec_a_imag, vec_b_imag); /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */ - acc_imag = svmla_f32_x(tail_pg, acc_imag, vec_a_real, vec_b_imag); - acc_imag = svmla_f32_x(tail_pg, acc_imag, vec_a_imag, vec_b_real); + acc_imag = svmla_f32_m(tail_pg, acc_imag, vec_a_real, vec_b_imag); + acc_imag = svmla_f32_m(tail_pg, acc_imag, vec_a_imag, vec_b_real); } *p_src_c_re = svaddv_f32(pg, acc_real); diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c index 29629a0..82ff5c4 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c index cb03f06..36d4043 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c index 7957e46..1bceca5 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited 
and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c index 1d3d9a2..9a40c00 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c index 613ffd4..a6660f2 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c index da4a576..ac068af 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp index 9cce4ad..617ddba 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c index 05fef2f..322e0cd 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp index 53d28e5..34e0c2d 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp index 3bfaa21..582de2f 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/vec_mul.hpp" diff --git 
a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp index ea7c8e5..310c017 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp index 31242b2..7d41c34 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp index c458999..7d96bf4 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp index 403d356..c8fb82a 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/DuRuInterface/bit_packing_common.hpp b/src/DuRuInterface/bit_packing_common.hpp index be0b4c8..2c7af3b 100644 --- a/src/DuRuInterface/bit_packing_common.hpp +++ b/src/DuRuInterface/bit_packing_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/Correlation/arm_correlation.c b/src/LowerPHY/Correlation/arm_correlation.c index 8fee9b7..71dce49 100644 --- a/src/LowerPHY/Correlation/arm_correlation.c +++ b/src/LowerPHY/Correlation/arm_correlation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FFT/fft_cf32.cpp b/src/LowerPHY/FFT/fft_cf32.cpp index 58d2850..830bb02 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" #include "fft_plan.hpp" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c index 247de42..bf5176b 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c +++ 
b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gs.h" @@ -2622,12 +2622,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs11(const armral_cmplx_f32_t *restrict x, float v416 = 5.5486073394528512e-01F; float v419 = 1.2412944743900585e+00F; float v420 = -1.2412944743900585e+00F; - float v426 = 2.0897833842005753e-01F; - float v427 = -2.0897833842005753e-01F; - float v433 = 3.7415717312460806e-01F; - float v434 = -3.7415717312460806e-01F; - float v440 = 4.9929922194110354e-02F; - float v441 = -4.9929922194110354e-02F; + float v426 = 2.0897833842005756e-01F; + float v427 = -2.0897833842005756e-01F; + float v433 = 3.7415717312460811e-01F; + float v434 = -3.7415717312460811e-01F; + float v440 = 4.9929922194110327e-02F; + float v441 = -4.9929922194110327e-02F; float v447 = 6.5815896284539266e-01F; float v448 = -6.5815896284539266e-01F; float v454 = 6.3306543373877577e-01F; @@ -2923,9 +2923,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs11(const armral_cmplx_f32_t *restrict x, float v325 = 1.0702757469471715e+00F; float v330 = 5.5486073394528512e-01F; float v335 = -1.2412944743900585e+00F; - float v342 = -2.0897833842005753e-01F; - float v349 = -3.7415717312460806e-01F; - float v356 = -4.9929922194110354e-02F; + float v342 = -2.0897833842005756e-01F; + float v349 = -3.7415717312460811e-01F; + float v356 = -4.9929922194110327e-02F; float v363 = -6.5815896284539266e-01F; float v370 = -6.3306543373877577e-01F; float v377 = -1.0822460581641109e+00F; @@ -12769,12 +12769,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, float v977 = 5.5486073394528512e-01F; float v980 = 1.2412944743900585e+00F; float v981 = -1.2412944743900585e+00F; - float v987 = 2.0897833842005753e-01F; - float v988 = -2.0897833842005753e-01F; - float v994 = 3.7415717312460806e-01F; - float v995 = -3.7415717312460806e-01F; - float v1001 = 4.9929922194110354e-02F; - float v1002 = -4.9929922194110354e-02F; + float v987 = 2.0897833842005756e-01F; + float v988 = -2.0897833842005756e-01F; + float v994 = 3.7415717312460811e-01F; + float v995 = -3.7415717312460811e-01F; + float v1001 = 4.9929922194110327e-02F; + float v1002 = -4.9929922194110327e-02F; float v1008 = 6.5815896284539266e-01F; float v1009 = -6.5815896284539266e-01F; float v1015 = 6.3306543373877577e-01F; @@ -13316,9 +13316,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, float v787 = 1.0702757469471715e+00F; float v792 = 5.5486073394528512e-01F; float v797 = -1.2412944743900585e+00F; - float v804 = -2.0897833842005753e-01F; - float v811 = -3.7415717312460806e-01F; - float v818 = -4.9929922194110354e-02F; + float v804 = -2.0897833842005756e-01F; + float v811 = -3.7415717312460811e-01F; + float v818 = -4.9929922194110327e-02F; float v825 = -6.5815896284539266e-01F; float v832 = -6.3306543373877577e-01F; float v839 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h index 64be0bf..98033b7 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c 
b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c index 1468fa5..23c5797 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ab_t_gu.h" @@ -2531,12 +2531,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu11(const armral_cmplx_f32_t *restrict x, float v416 = 5.5486073394528512e-01F; float v419 = 1.2412944743900585e+00F; float v420 = -1.2412944743900585e+00F; - float v426 = 2.0897833842005753e-01F; - float v427 = -2.0897833842005753e-01F; - float v433 = 3.7415717312460806e-01F; - float v434 = -3.7415717312460806e-01F; - float v440 = 4.9929922194110354e-02F; - float v441 = -4.9929922194110354e-02F; + float v426 = 2.0897833842005756e-01F; + float v427 = -2.0897833842005756e-01F; + float v433 = 3.7415717312460811e-01F; + float v434 = -3.7415717312460811e-01F; + float v440 = 4.9929922194110327e-02F; + float v441 = -4.9929922194110327e-02F; float v447 = 6.5815896284539266e-01F; float v448 = -6.5815896284539266e-01F; float v454 = 6.3306543373877577e-01F; @@ -2830,9 +2830,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu11(const armral_cmplx_f32_t *restrict x, float v325 = 1.0702757469471715e+00F; float v330 = 5.5486073394528512e-01F; float v335 = -1.2412944743900585e+00F; - float v342 = -2.0897833842005753e-01F; - float v349 = -3.7415717312460806e-01F; - float v356 = -4.9929922194110354e-02F; + float v342 = -2.0897833842005756e-01F; + float v349 = -3.7415717312460811e-01F; + float v356 = -4.9929922194110327e-02F; float v363 = -6.5815896284539266e-01F; float v370 = -6.3306543373877577e-01F; float v377 = -1.0822460581641109e+00F; @@ -12458,12 +12458,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, float v977 = 5.5486073394528512e-01F; float v980 = 1.2412944743900585e+00F; float v981 = -1.2412944743900585e+00F; - float v987 = 2.0897833842005753e-01F; - float v988 = -2.0897833842005753e-01F; - float v994 = 3.7415717312460806e-01F; - float v995 = -3.7415717312460806e-01F; - float v1001 = 4.9929922194110354e-02F; - float v1002 = -4.9929922194110354e-02F; + float v987 = 2.0897833842005756e-01F; + float v988 = -2.0897833842005756e-01F; + float v994 = 3.7415717312460811e-01F; + float v995 = -3.7415717312460811e-01F; + float v1001 = 4.9929922194110327e-02F; + float v1002 = -4.9929922194110327e-02F; float v1008 = 6.5815896284539266e-01F; float v1009 = -6.5815896284539266e-01F; float v1015 = 6.3306543373877577e-01F; @@ -13003,9 +13003,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, float v787 = 1.0702757469471715e+00F; float v792 = 5.5486073394528512e-01F; float v797 = -1.2412944743900585e+00F; - float v804 = -2.0897833842005753e-01F; - float v811 = -3.7415717312460806e-01F; - float v818 = -4.9929922194110354e-02F; + float v804 = -2.0897833842005756e-01F; + float v811 = -3.7415717312460811e-01F; + float v818 = -4.9929922194110327e-02F; float v825 = -6.5815896284539266e-01F; float v832 = -6.3306543373877577e-01F; float v839 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h index fe06a56..8edbe46 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 
2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c index d2b63f7..a61ff10 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_gu.h" @@ -1747,12 +1747,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t *restrict x, float v156 = 5.5486073394528512e-01F; float v159 = 1.2412944743900585e+00F; float v160 = -1.2412944743900585e+00F; - float v166 = 2.0897833842005753e-01F; - float v167 = -2.0897833842005753e-01F; - float v173 = 3.7415717312460806e-01F; - float v174 = -3.7415717312460806e-01F; - float v180 = 4.9929922194110354e-02F; - float v181 = -4.9929922194110354e-02F; + float v166 = 2.0897833842005756e-01F; + float v167 = -2.0897833842005756e-01F; + float v173 = 3.7415717312460811e-01F; + float v174 = -3.7415717312460811e-01F; + float v180 = 4.9929922194110327e-02F; + float v181 = -4.9929922194110327e-02F; float v187 = 6.5815896284539266e-01F; float v188 = -6.5815896284539266e-01F; float v194 = 6.3306543373877577e-01F; @@ -1964,9 +1964,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t *restrict x, float v185 = 1.0702757469471715e+00F; float v190 = 5.5486073394528512e-01F; float v195 = -1.2412944743900585e+00F; - float v202 = -2.0897833842005753e-01F; - float v209 = -3.7415717312460806e-01F; - float v216 = -4.9929922194110354e-02F; + float v202 = -2.0897833842005756e-01F; + float v209 = -3.7415717312460811e-01F; + float v216 = -4.9929922194110327e-02F; float v223 = -6.5815896284539266e-01F; float v230 = -6.3306543373877577e-01F; float v237 = -1.0822460581641109e+00F; @@ -8899,12 +8899,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu22(const armral_cmplx_f32_t *restrict x, float v431 = 5.5486073394528512e-01F; float v434 = 1.2412944743900585e+00F; float v435 = -1.2412944743900585e+00F; - float v441 = 2.0897833842005753e-01F; - float v442 = -2.0897833842005753e-01F; - float v448 = 3.7415717312460806e-01F; - float v449 = -3.7415717312460806e-01F; - float v455 = 4.9929922194110354e-02F; - float v456 = -4.9929922194110354e-02F; + float v441 = 2.0897833842005756e-01F; + float v442 = -2.0897833842005756e-01F; + float v448 = 3.7415717312460811e-01F; + float v449 = -3.7415717312460811e-01F; + float v455 = 4.9929922194110327e-02F; + float v456 = -4.9929922194110327e-02F; float v462 = 6.5815896284539266e-01F; float v463 = -6.5815896284539266e-01F; float v469 = 6.3306543373877577e-01F; @@ -9274,9 +9274,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu22(const armral_cmplx_f32_t *restrict x, float v493 = 1.0702757469471715e+00F; float v498 = 5.5486073394528512e-01F; float v503 = -1.2412944743900585e+00F; - float v510 = -2.0897833842005753e-01F; - float v517 = -3.7415717312460806e-01F; - float v524 = -4.9929922194110354e-02F; + float v510 = -2.0897833842005756e-01F; + float v517 = -3.7415717312460811e-01F; + float v524 = -4.9929922194110327e-02F; float v531 = -6.5815896284539266e-01F; float v538 = -6.3306543373877577e-01F; float v545 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h index 557619f..57014ea 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h @@ -1,6 +1,6 
@@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c index e4aed1b..cd7b9b1 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_n_uu.h" @@ -2421,12 +2421,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v194 = 5.5486073394528512e-01F; float v198 = 1.2412944743900585e+00F; float v199 = -1.2412944743900585e+00F; - float v206 = 2.0897833842005753e-01F; - float v207 = -2.0897833842005753e-01F; - float v214 = 3.7415717312460806e-01F; - float v215 = -3.7415717312460806e-01F; - float v222 = 4.9929922194110354e-02F; - float v223 = -4.9929922194110354e-02F; + float v206 = 2.0897833842005756e-01F; + float v207 = -2.0897833842005756e-01F; + float v214 = 3.7415717312460811e-01F; + float v215 = -3.7415717312460811e-01F; + float v222 = 4.9929922194110327e-02F; + float v223 = -4.9929922194110327e-02F; float v230 = 6.5815896284539266e-01F; float v231 = -6.5815896284539266e-01F; float v238 = 6.3306543373877577e-01F; @@ -2665,12 +2665,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v542 = 5.5486073394528512e-01F; float v545 = 1.2412944743900585e+00F; float v546 = -1.2412944743900585e+00F; - float v552 = 2.0897833842005753e-01F; - float v553 = -2.0897833842005753e-01F; - float v559 = 3.7415717312460806e-01F; - float v560 = -3.7415717312460806e-01F; - float v566 = 4.9929922194110354e-02F; - float v567 = -4.9929922194110354e-02F; + float v552 = 2.0897833842005756e-01F; + float v553 = -2.0897833842005756e-01F; + float v559 = 3.7415717312460811e-01F; + float v560 = -3.7415717312460811e-01F; + float v566 = 4.9929922194110327e-02F; + float v567 = -4.9929922194110327e-02F; float v573 = 6.5815896284539266e-01F; float v574 = -6.5815896284539266e-01F; float v580 = 6.3306543373877577e-01F; @@ -2881,9 +2881,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v185 = 1.0702757469471715e+00F; float v190 = 5.5486073394528512e-01F; float v195 = -1.2412944743900585e+00F; - float v202 = -2.0897833842005753e-01F; - float v209 = -3.7415717312460806e-01F; - float v216 = -4.9929922194110354e-02F; + float v202 = -2.0897833842005756e-01F; + float v209 = -3.7415717312460811e-01F; + float v216 = -4.9929922194110327e-02F; float v223 = -6.5815896284539266e-01F; float v230 = -6.3306543373877577e-01F; float v237 = -1.0822460581641109e+00F; @@ -12672,12 +12672,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v512 = 5.5486073394528512e-01F; float v516 = 1.2412944743900585e+00F; float v517 = -1.2412944743900585e+00F; - float v524 = 2.0897833842005753e-01F; - float v525 = -2.0897833842005753e-01F; - float v532 = 3.7415717312460806e-01F; - float v533 = -3.7415717312460806e-01F; - float v540 = 4.9929922194110354e-02F; - float v541 = -4.9929922194110354e-02F; + float v524 = 2.0897833842005756e-01F; + float v525 = -2.0897833842005756e-01F; + float v532 = 3.7415717312460811e-01F; + float v533 = -3.7415717312460811e-01F; + float v540 = 4.9929922194110327e-02F; + float v541 = -4.9929922194110327e-02F; float v548 = 
6.5815896284539266e-01F; float v549 = -6.5815896284539266e-01F; float v556 = 6.3306543373877577e-01F; @@ -13096,12 +13096,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v1212 = 5.5486073394528512e-01F; float v1215 = 1.2412944743900585e+00F; float v1216 = -1.2412944743900585e+00F; - float v1222 = 2.0897833842005753e-01F; - float v1223 = -2.0897833842005753e-01F; - float v1229 = 3.7415717312460806e-01F; - float v1230 = -3.7415717312460806e-01F; - float v1236 = 4.9929922194110354e-02F; - float v1237 = -4.9929922194110354e-02F; + float v1222 = 2.0897833842005756e-01F; + float v1223 = -2.0897833842005756e-01F; + float v1229 = 3.7415717312460811e-01F; + float v1230 = -3.7415717312460811e-01F; + float v1236 = 4.9929922194110327e-02F; + float v1237 = -4.9929922194110327e-02F; float v1243 = 6.5815896284539266e-01F; float v1244 = -6.5815896284539266e-01F; float v1250 = 6.3306543373877577e-01F; @@ -13470,9 +13470,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v493 = 1.0702757469471715e+00F; float v498 = 5.5486073394528512e-01F; float v503 = -1.2412944743900585e+00F; - float v510 = -2.0897833842005753e-01F; - float v517 = -3.7415717312460806e-01F; - float v524 = -4.9929922194110354e-02F; + float v510 = -2.0897833842005756e-01F; + float v517 = -3.7415717312460811e-01F; + float v524 = -4.9929922194110327e-02F; float v531 = -6.5815896284539266e-01F; float v538 = -6.3306543373877577e-01F; float v545 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h index 00c743b..9b78818 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c index bab6391..53ef283 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cf32_ac_t_uu.h" @@ -3449,12 +3449,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float v434 = 5.5486073394528512e-01F; float v438 = 1.2412944743900585e+00F; float v439 = -1.2412944743900585e+00F; - float v446 = 2.0897833842005753e-01F; - float v447 = -2.0897833842005753e-01F; - float v454 = 3.7415717312460806e-01F; - float v455 = -3.7415717312460806e-01F; - float v462 = 4.9929922194110354e-02F; - float v463 = -4.9929922194110354e-02F; + float v446 = 2.0897833842005756e-01F; + float v447 = -2.0897833842005756e-01F; + float v454 = 3.7415717312460811e-01F; + float v455 = -3.7415717312460811e-01F; + float v462 = 4.9929922194110327e-02F; + float v463 = -4.9929922194110327e-02F; float v470 = 6.5815896284539266e-01F; float v471 = -6.5815896284539266e-01F; float v478 = 6.3306543373877577e-01F; @@ -3773,12 +3773,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float v982 = 5.5486073394528512e-01F; float v985 = 1.2412944743900585e+00F; float v986 = -1.2412944743900585e+00F; - float v992 = 2.0897833842005753e-01F; - float v993 = -2.0897833842005753e-01F; - float v999 = 3.7415717312460806e-01F; - float v1000 
= -3.7415717312460806e-01F; - float v1006 = 4.9929922194110354e-02F; - float v1007 = -4.9929922194110354e-02F; + float v992 = 2.0897833842005756e-01F; + float v993 = -2.0897833842005756e-01F; + float v999 = 3.7415717312460811e-01F; + float v1000 = -3.7415717312460811e-01F; + float v1006 = 4.9929922194110327e-02F; + float v1007 = -4.9929922194110327e-02F; float v1013 = 6.5815896284539266e-01F; float v1014 = -6.5815896284539266e-01F; float v1020 = 6.3306543373877577e-01F; @@ -4051,9 +4051,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float v265 = 1.0702757469471715e+00F; float v270 = 5.5486073394528512e-01F; float v275 = -1.2412944743900585e+00F; - float v282 = -2.0897833842005753e-01F; - float v289 = -3.7415717312460806e-01F; - float v296 = -4.9929922194110354e-02F; + float v282 = -2.0897833842005756e-01F; + float v289 = -3.7415717312460811e-01F; + float v296 = -4.9929922194110327e-02F; float v303 = -6.5815896284539266e-01F; float v310 = -6.3306543373877577e-01F; float v317 = -1.0822460581641109e+00F; @@ -17320,12 +17320,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float v1016 = 5.5486073394528512e-01F; float v1020 = 1.2412944743900585e+00F; float v1021 = -1.2412944743900585e+00F; - float v1028 = 2.0897833842005753e-01F; - float v1029 = -2.0897833842005753e-01F; - float v1036 = 3.7415717312460806e-01F; - float v1037 = -3.7415717312460806e-01F; - float v1044 = 4.9929922194110354e-02F; - float v1045 = -4.9929922194110354e-02F; + float v1028 = 2.0897833842005756e-01F; + float v1029 = -2.0897833842005756e-01F; + float v1036 = 3.7415717312460811e-01F; + float v1037 = -3.7415717312460811e-01F; + float v1044 = 4.9929922194110327e-02F; + float v1045 = -4.9929922194110327e-02F; float v1052 = 6.5815896284539266e-01F; float v1053 = -6.5815896284539266e-01F; float v1060 = 6.3306543373877577e-01F; @@ -17912,12 +17912,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float v2136 = 5.5486073394528512e-01F; float v2139 = 1.2412944743900585e+00F; float v2140 = -1.2412944743900585e+00F; - float v2146 = 2.0897833842005753e-01F; - float v2147 = -2.0897833842005753e-01F; - float v2153 = 3.7415717312460806e-01F; - float v2154 = -3.7415717312460806e-01F; - float v2160 = 4.9929922194110354e-02F; - float v2161 = -4.9929922194110354e-02F; + float v2146 = 2.0897833842005756e-01F; + float v2147 = -2.0897833842005756e-01F; + float v2153 = 3.7415717312460811e-01F; + float v2154 = -3.7415717312460811e-01F; + float v2160 = 4.9929922194110327e-02F; + float v2161 = -4.9929922194110327e-02F; float v2167 = 6.5815896284539266e-01F; float v2168 = -6.5815896284539266e-01F; float v2174 = 6.3306543373877577e-01F; @@ -18414,9 +18414,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float v661 = 1.0702757469471715e+00F; float v666 = 5.5486073394528512e-01F; float v671 = -1.2412944743900585e+00F; - float v678 = -2.0897833842005753e-01F; - float v685 = -3.7415717312460806e-01F; - float v692 = -4.9929922194110354e-02F; + float v678 = -2.0897833842005756e-01F; + float v685 = -3.7415717312460811e-01F; + float v692 = -4.9929922194110327e-02F; float v699 = -6.5815896284539266e-01F; float v706 = -6.3306543373877577e-01F; float v713 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h index 6dc4485..37e39fd 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h 
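The literal updates in these size-11 and size-22 FFT kernels only change the 16th and 17th significant digits of the coefficient constants. Since binary32 resolves roughly 7 significant digits, both the old and new spellings should compile to the same float values, leaving the kernels numerically unchanged; the literals appear simply to have been regenerated at full double precision. A quick standalone check (illustrative only, not part of the library):

  #include <stdio.h>

  int main(void) {
    /* One of the updated constant pairs from the kernels above. */
    float old_v = 2.0897833842005753e-01F;
    float new_v = 2.0897833842005756e-01F;
    /* Prints 1 if both decimal spellings round to the same binary32 value. */
    printf("same constant: %d\n", old_v == new_v);
    printf("old: %.9e  new: %.9e\n", (double)old_v, (double)new_v);
    return 0;
  }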
@@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c index 5127454..86d9544 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ab_t_gu.h" @@ -2909,12 +2909,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu11(const armral_cmplx_f32_t *restrict x, float v416 = 5.5486073394528512e-01F; float v419 = 1.2412944743900585e+00F; float v420 = -1.2412944743900585e+00F; - float v426 = 2.0897833842005753e-01F; - float v427 = -2.0897833842005753e-01F; - float v433 = 3.7415717312460806e-01F; - float v434 = -3.7415717312460806e-01F; - float v440 = 4.9929922194110354e-02F; - float v441 = -4.9929922194110354e-02F; + float v426 = 2.0897833842005756e-01F; + float v427 = -2.0897833842005756e-01F; + float v433 = 3.7415717312460811e-01F; + float v434 = -3.7415717312460811e-01F; + float v440 = 4.9929922194110327e-02F; + float v441 = -4.9929922194110327e-02F; float v447 = 6.5815896284539266e-01F; float v448 = -6.5815896284539266e-01F; float v454 = 6.3306543373877577e-01F; @@ -3230,9 +3230,9 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu11(const armral_cmplx_f32_t *restrict x, float v325 = 1.0702757469471715e+00F; float v330 = 5.5486073394528512e-01F; float v335 = -1.2412944743900585e+00F; - float v342 = -2.0897833842005753e-01F; - float v349 = -3.7415717312460806e-01F; - float v356 = -4.9929922194110354e-02F; + float v342 = -2.0897833842005756e-01F; + float v349 = -3.7415717312460811e-01F; + float v356 = -4.9929922194110327e-02F; float v363 = -6.5815896284539266e-01F; float v370 = -6.3306543373877577e-01F; float v377 = -1.0822460581641109e+00F; @@ -14068,12 +14068,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, float v977 = 5.5486073394528512e-01F; float v980 = 1.2412944743900585e+00F; float v981 = -1.2412944743900585e+00F; - float v987 = 2.0897833842005753e-01F; - float v988 = -2.0897833842005753e-01F; - float v994 = 3.7415717312460806e-01F; - float v995 = -3.7415717312460806e-01F; - float v1001 = 4.9929922194110354e-02F; - float v1002 = -4.9929922194110354e-02F; + float v987 = 2.0897833842005756e-01F; + float v988 = -2.0897833842005756e-01F; + float v994 = 3.7415717312460811e-01F; + float v995 = -3.7415717312460811e-01F; + float v1001 = 4.9929922194110327e-02F; + float v1002 = -4.9929922194110327e-02F; float v1008 = 6.5815896284539266e-01F; float v1009 = -6.5815896284539266e-01F; float v1015 = 6.3306543373877577e-01F; @@ -14657,9 +14657,9 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, float v787 = 1.0702757469471715e+00F; float v792 = 5.5486073394528512e-01F; float v797 = -1.2412944743900585e+00F; - float v804 = -2.0897833842005753e-01F; - float v811 = -3.7415717312460806e-01F; - float v818 = -4.9929922194110354e-02F; + float v804 = -2.0897833842005756e-01F; + float v811 = -3.7415717312460811e-01F; + float v818 = -4.9929922194110327e-02F; float v825 = -6.5815896284539266e-01F; float v832 = -6.3306543373877577e-01F; float v839 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h index 
a540a90..aaba874 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c index fede572..33d7282 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_cf32_cs16_ac_n_uu.h" @@ -2853,12 +2853,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v194 = 5.5486073394528512e-01F; float v198 = 1.2412944743900585e+00F; float v199 = -1.2412944743900585e+00F; - float v206 = 2.0897833842005753e-01F; - float v207 = -2.0897833842005753e-01F; - float v214 = 3.7415717312460806e-01F; - float v215 = -3.7415717312460806e-01F; - float v222 = 4.9929922194110354e-02F; - float v223 = -4.9929922194110354e-02F; + float v206 = 2.0897833842005756e-01F; + float v207 = -2.0897833842005756e-01F; + float v214 = 3.7415717312460811e-01F; + float v215 = -3.7415717312460811e-01F; + float v222 = 4.9929922194110327e-02F; + float v223 = -4.9929922194110327e-02F; float v230 = 6.5815896284539266e-01F; float v231 = -6.5815896284539266e-01F; float v238 = 6.3306543373877577e-01F; @@ -3108,12 +3108,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v553 = 5.5486073394528512e-01F; float v556 = 1.2412944743900585e+00F; float v557 = -1.2412944743900585e+00F; - float v563 = 2.0897833842005753e-01F; - float v564 = -2.0897833842005753e-01F; - float v570 = 3.7415717312460806e-01F; - float v571 = -3.7415717312460806e-01F; - float v577 = 4.9929922194110354e-02F; - float v578 = -4.9929922194110354e-02F; + float v563 = 2.0897833842005756e-01F; + float v564 = -2.0897833842005756e-01F; + float v570 = 3.7415717312460811e-01F; + float v571 = -3.7415717312460811e-01F; + float v577 = 4.9929922194110327e-02F; + float v578 = -4.9929922194110327e-02F; float v584 = 6.5815896284539266e-01F; float v585 = -6.5815896284539266e-01F; float v591 = 6.3306543373877577e-01F; @@ -3346,9 +3346,9 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v185 = 1.0702757469471715e+00F; float v190 = 5.5486073394528512e-01F; float v195 = -1.2412944743900585e+00F; - float v202 = -2.0897833842005753e-01F; - float v209 = -3.7415717312460806e-01F; - float v216 = -4.9929922194110354e-02F; + float v202 = -2.0897833842005756e-01F; + float v209 = -3.7415717312460811e-01F; + float v216 = -4.9929922194110327e-02F; float v223 = -6.5815896284539266e-01F; float v230 = -6.3306543373877577e-01F; float v237 = -1.0822460581641109e+00F; @@ -14512,12 +14512,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v512 = 5.5486073394528512e-01F; float v516 = 1.2412944743900585e+00F; float v517 = -1.2412944743900585e+00F; - float v524 = 2.0897833842005753e-01F; - float v525 = -2.0897833842005753e-01F; - float v532 = 3.7415717312460806e-01F; - float v533 = -3.7415717312460806e-01F; - float v540 = 4.9929922194110354e-02F; - float v541 = -4.9929922194110354e-02F; + float v524 = 2.0897833842005756e-01F; + float v525 = -2.0897833842005756e-01F; + float v532 = 3.7415717312460811e-01F; + 
float v533 = -3.7415717312460811e-01F; + float v540 = 4.9929922194110327e-02F; + float v541 = -4.9929922194110327e-02F; float v548 = 6.5815896284539266e-01F; float v549 = -6.5815896284539266e-01F; float v556 = 6.3306543373877577e-01F; @@ -14958,12 +14958,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v1234 = 5.5486073394528512e-01F; float v1237 = 1.2412944743900585e+00F; float v1238 = -1.2412944743900585e+00F; - float v1244 = 2.0897833842005753e-01F; - float v1245 = -2.0897833842005753e-01F; - float v1251 = 3.7415717312460806e-01F; - float v1252 = -3.7415717312460806e-01F; - float v1258 = 4.9929922194110354e-02F; - float v1259 = -4.9929922194110354e-02F; + float v1244 = 2.0897833842005756e-01F; + float v1245 = -2.0897833842005756e-01F; + float v1251 = 3.7415717312460811e-01F; + float v1252 = -3.7415717312460811e-01F; + float v1258 = 4.9929922194110327e-02F; + float v1259 = -4.9929922194110327e-02F; float v1265 = 6.5815896284539266e-01F; float v1266 = -6.5815896284539266e-01F; float v1272 = 6.3306543373877577e-01F; @@ -15376,9 +15376,9 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v493 = 1.0702757469471715e+00F; float v498 = 5.5486073394528512e-01F; float v503 = -1.2412944743900585e+00F; - float v510 = -2.0897833842005753e-01F; - float v517 = -3.7415717312460806e-01F; - float v524 = -4.9929922194110354e-02F; + float v510 = -2.0897833842005756e-01F; + float v517 = -3.7415717312460811e-01F; + float v524 = -4.9929922194110327e-02F; float v531 = -6.5815896284539266e-01F; float v538 = -6.3306543373877577e-01F; float v545 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h index 9dfd66c..8bbb2de 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c index 531eba9..cac45e5 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cf32_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h index b0a10f3..9f0f294 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index 50ac067..2da312a 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c index e8a2aa0..ecb6566 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - 
Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cf32_ac_n_uu.h" @@ -955,12 +955,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v205 = 5.5486073394528512e-01F; float v209 = 1.2412944743900585e+00F; float v210 = -1.2412944743900585e+00F; - float v217 = 2.0897833842005753e-01F; - float v218 = -2.0897833842005753e-01F; - float v225 = 3.7415717312460806e-01F; - float v226 = -3.7415717312460806e-01F; - float v233 = 4.9929922194110354e-02F; - float v234 = -4.9929922194110354e-02F; + float v217 = 2.0897833842005756e-01F; + float v218 = -2.0897833842005756e-01F; + float v225 = 3.7415717312460811e-01F; + float v226 = -3.7415717312460811e-01F; + float v233 = 4.9929922194110327e-02F; + float v234 = -4.9929922194110327e-02F; float v241 = 6.5815896284539266e-01F; float v242 = -6.5815896284539266e-01F; float v249 = 6.3306543373877577e-01F; @@ -1210,12 +1210,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v564 = 5.5486073394528512e-01F; float v567 = 1.2412944743900585e+00F; float v568 = -1.2412944743900585e+00F; - float v574 = 2.0897833842005753e-01F; - float v575 = -2.0897833842005753e-01F; - float v581 = 3.7415717312460806e-01F; - float v582 = -3.7415717312460806e-01F; - float v588 = 4.9929922194110354e-02F; - float v589 = -4.9929922194110354e-02F; + float v574 = 2.0897833842005756e-01F; + float v575 = -2.0897833842005756e-01F; + float v581 = 3.7415717312460811e-01F; + float v582 = -3.7415717312460811e-01F; + float v588 = 4.9929922194110327e-02F; + float v589 = -4.9929922194110327e-02F; float v595 = 6.5815896284539266e-01F; float v596 = -6.5815896284539266e-01F; float v602 = 6.3306543373877577e-01F; @@ -1437,9 +1437,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v196 = 1.0702757469471715e+00F; float v201 = 5.5486073394528512e-01F; float v206 = -1.2412944743900585e+00F; - float v213 = -2.0897833842005753e-01F; - float v220 = -3.7415717312460806e-01F; - float v227 = -4.9929922194110354e-02F; + float v213 = -2.0897833842005756e-01F; + float v220 = -3.7415717312460811e-01F; + float v227 = -4.9929922194110327e-02F; float v234 = -6.5815896284539266e-01F; float v241 = -6.3306543373877577e-01F; float v248 = -1.0822460581641109e+00F; @@ -11555,12 +11555,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v534 = 5.5486073394528512e-01F; float v538 = 1.2412944743900585e+00F; float v539 = -1.2412944743900585e+00F; - float v546 = 2.0897833842005753e-01F; - float v547 = -2.0897833842005753e-01F; - float v554 = 3.7415717312460806e-01F; - float v555 = -3.7415717312460806e-01F; - float v562 = 4.9929922194110354e-02F; - float v563 = -4.9929922194110354e-02F; + float v546 = 2.0897833842005756e-01F; + float v547 = -2.0897833842005756e-01F; + float v554 = 3.7415717312460811e-01F; + float v555 = -3.7415717312460811e-01F; + float v562 = 4.9929922194110327e-02F; + float v563 = -4.9929922194110327e-02F; float v570 = 6.5815896284539266e-01F; float v571 = -6.5815896284539266e-01F; float v578 = 6.3306543373877577e-01F; @@ -12001,12 +12001,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v1256 = 5.5486073394528512e-01F; float v1259 = 1.2412944743900585e+00F; float v1260 = -1.2412944743900585e+00F; - float v1266 = 2.0897833842005753e-01F; - float v1267 = -2.0897833842005753e-01F; - float v1273 = 3.7415717312460806e-01F; - float 
v1274 = -3.7415717312460806e-01F; - float v1280 = 4.9929922194110354e-02F; - float v1281 = -4.9929922194110354e-02F; + float v1266 = 2.0897833842005756e-01F; + float v1267 = -2.0897833842005756e-01F; + float v1273 = 3.7415717312460811e-01F; + float v1274 = -3.7415717312460811e-01F; + float v1280 = 4.9929922194110327e-02F; + float v1281 = -4.9929922194110327e-02F; float v1287 = 6.5815896284539266e-01F; float v1288 = -6.5815896284539266e-01F; float v1294 = 6.3306543373877577e-01F; @@ -12397,9 +12397,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v515 = 1.0702757469471715e+00F; float v520 = 5.5486073394528512e-01F; float v525 = -1.2412944743900585e+00F; - float v532 = -2.0897833842005753e-01F; - float v539 = -3.7415717312460806e-01F; - float v546 = -4.9929922194110354e-02F; + float v532 = -2.0897833842005756e-01F; + float v539 = -3.7415717312460811e-01F; + float v546 = -4.9929922194110327e-02F; float v553 = -6.5815896284539266e-01F; float v560 = -6.3306543373877577e-01F; float v567 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h index d4ff929..fe8b750 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c index e07c745..609bf1d 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_cf32_cs16_ac_n_uu.h" @@ -3123,12 +3123,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v205 = 5.5486073394528512e-01F; float v209 = 1.2412944743900585e+00F; float v210 = -1.2412944743900585e+00F; - float v217 = 2.0897833842005753e-01F; - float v218 = -2.0897833842005753e-01F; - float v225 = 3.7415717312460806e-01F; - float v226 = -3.7415717312460806e-01F; - float v233 = 4.9929922194110354e-02F; - float v234 = -4.9929922194110354e-02F; + float v217 = 2.0897833842005756e-01F; + float v218 = -2.0897833842005756e-01F; + float v225 = 3.7415717312460811e-01F; + float v226 = -3.7415717312460811e-01F; + float v233 = 4.9929922194110327e-02F; + float v234 = -4.9929922194110327e-02F; float v241 = 6.5815896284539266e-01F; float v242 = -6.5815896284539266e-01F; float v249 = 6.3306543373877577e-01F; @@ -3389,12 +3389,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v575 = 5.5486073394528512e-01F; float v578 = 1.2412944743900585e+00F; float v579 = -1.2412944743900585e+00F; - float v585 = 2.0897833842005753e-01F; - float v586 = -2.0897833842005753e-01F; - float v592 = 3.7415717312460806e-01F; - float v593 = -3.7415717312460806e-01F; - float v599 = 4.9929922194110354e-02F; - float v600 = -4.9929922194110354e-02F; + float v585 = 2.0897833842005756e-01F; + float v586 = -2.0897833842005756e-01F; + float v592 = 3.7415717312460811e-01F; + float v593 = -3.7415717312460811e-01F; + float v599 = 4.9929922194110327e-02F; + float v600 = -4.9929922194110327e-02F; float v606 = 6.5815896284539266e-01F; float v607 = -6.5815896284539266e-01F; float v613 = 
6.3306543373877577e-01F; @@ -3638,9 +3638,9 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, float v196 = 1.0702757469471715e+00F; float v201 = 5.5486073394528512e-01F; float v206 = -1.2412944743900585e+00F; - float v213 = -2.0897833842005753e-01F; - float v220 = -3.7415717312460806e-01F; - float v227 = -4.9929922194110354e-02F; + float v213 = -2.0897833842005756e-01F; + float v220 = -3.7415717312460811e-01F; + float v227 = -4.9929922194110327e-02F; float v234 = -6.5815896284539266e-01F; float v241 = -6.3306543373877577e-01F; float v248 = -1.0822460581641109e+00F; @@ -15662,12 +15662,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v534 = 5.5486073394528512e-01F; float v538 = 1.2412944743900585e+00F; float v539 = -1.2412944743900585e+00F; - float v546 = 2.0897833842005753e-01F; - float v547 = -2.0897833842005753e-01F; - float v554 = 3.7415717312460806e-01F; - float v555 = -3.7415717312460806e-01F; - float v562 = 4.9929922194110354e-02F; - float v563 = -4.9929922194110354e-02F; + float v546 = 2.0897833842005756e-01F; + float v547 = -2.0897833842005756e-01F; + float v554 = 3.7415717312460811e-01F; + float v555 = -3.7415717312460811e-01F; + float v562 = 4.9929922194110327e-02F; + float v563 = -4.9929922194110327e-02F; float v570 = 6.5815896284539266e-01F; float v571 = -6.5815896284539266e-01F; float v578 = 6.3306543373877577e-01F; @@ -16130,12 +16130,12 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v1278 = 5.5486073394528512e-01F; float v1281 = 1.2412944743900585e+00F; float v1282 = -1.2412944743900585e+00F; - float v1288 = 2.0897833842005753e-01F; - float v1289 = -2.0897833842005753e-01F; - float v1295 = 3.7415717312460806e-01F; - float v1296 = -3.7415717312460806e-01F; - float v1302 = 4.9929922194110354e-02F; - float v1303 = -4.9929922194110354e-02F; + float v1288 = 2.0897833842005756e-01F; + float v1289 = -2.0897833842005756e-01F; + float v1295 = 3.7415717312460811e-01F; + float v1296 = -3.7415717312460811e-01F; + float v1302 = 4.9929922194110327e-02F; + float v1303 = -4.9929922194110327e-02F; float v1309 = 6.5815896284539266e-01F; float v1310 = -6.5815896284539266e-01F; float v1316 = 6.3306543373877577e-01F; @@ -16570,9 +16570,9 @@ void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, float v515 = 1.0702757469471715e+00F; float v520 = 5.5486073394528512e-01F; float v525 = -1.2412944743900585e+00F; - float v532 = -2.0897833842005753e-01F; - float v539 = -3.7415717312460806e-01F; - float v546 = -4.9929922194110354e-02F; + float v532 = -2.0897833842005756e-01F; + float v539 = -3.7415717312460811e-01F; + float v546 = -4.9929922194110327e-02F; float v553 = -6.5815896284539266e-01F; float v560 = -6.3306543373877577e-01F; float v567 = -1.0822460581641109e+00F; diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h index 95273f4..163f863 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c index c3163fc..20287bc 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 
2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_cs16_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h index 98a229f..8476f0e 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index 420c70f..0231333 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_execute.hpp b/src/LowerPHY/FFT/fft_execute.hpp index 803a3d3..714d257 100644 --- a/src/LowerPHY/FFT/fft_execute.hpp +++ b/src/LowerPHY/FFT/fft_execute.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_helper.h b/src/LowerPHY/FFT/fft_helper.h index 9dfbc60..98f7c51 100644 --- a/src/LowerPHY/FFT/fft_helper.h +++ b/src/LowerPHY/FFT/fft_helper.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_level.cpp b/src/LowerPHY/FFT/fft_level.cpp index b5c45e1..a4402dc 100644 --- a/src/LowerPHY/FFT/fft_level.cpp +++ b/src/LowerPHY/FFT/fft_level.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_level.hpp" diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index 80bbdef..06cc0df 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index f6a2e56..833680a 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ b/src/LowerPHY/FFT/fft_plan.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "fft_plan.hpp" #include "fft_cf32_kernel_lookup.h" diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index d36715f..4196223 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/fft_types.hpp b/src/LowerPHY/FFT/fft_types.hpp index 7d6d67d..65ebfa6 100644 --- a/src/LowerPHY/FFT/fft_types.hpp +++ b/src/LowerPHY/FFT/fft_types.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index 
309ebe8..a05479c 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader.hpp" diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index eacaabd..6d1d21f 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FFT/rader_generator.cpp b/src/LowerPHY/FFT/rader_generator.cpp index b2c1c65..89e1386 100644 --- a/src/LowerPHY/FFT/rader_generator.cpp +++ b/src/LowerPHY/FFT/rader_generator.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rader_generator.hpp" diff --git a/src/LowerPHY/FFT/rader_generator.hpp b/src/LowerPHY/FFT/rader_generator.hpp index 30d6ca2..bc219d9 100644 --- a/src/LowerPHY/FFT/rader_generator.hpp +++ b/src/LowerPHY/FFT/rader_generator.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32.c b/src/LowerPHY/FIR/arm_fir_filter_cf32.c index 6bbefd7..428e231 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -16,7 +16,7 @@ static inline svfloat32x4_t fir_sve_blk_4(svbool_t pg, const float *in, // Compute FIR for four vector-lengths of data. Coeffs array is // unrolled by 2 and we have 2 accumulators per vector length, as // explained in fir_sve_blk_2. In addition, loads and mlas are - // hand-interleaved in order to minimise latency. + // hand-interleaved in order to minimize latency. svfloat32_t y1_1 = svdup_f32(0); svfloat32_t y2_1 = svdup_f32(0); diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c index 05446b1..e60e5e0 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -127,7 +127,7 @@ static inline void sv_fir_block_4(svbool_t pg, // Compute FIR for 4 vector-lengths of data (read 8 vector-lengths, write 4). // Coeffs array is unrolled by 2 and we have 2 accumulators per vector length, // as explained in sv_fir_block_2. In addition, loads and mlas are - // hand-interleaved in order to minimise latency. + // hand-interleaved in order to minimize latency. const uint64_t *in = (const uint64_t *)input; svfloat32_t y_0_0 = svdup_f32(0); @@ -235,7 +235,7 @@ static inline void sv_fir_block_8(svbool_t pg, // Unlike the previous 2 versions, we only need 1 accumulator per vector // length, as we have enough accumulators to hide the latency of ld2 and cmla // without needing to split them in half. Again, loads and mlas are - // hand-interleaved in order to minimise latency. + // hand-interleaved in order to minimize latency. 
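A minimal sketch of the multi-accumulator pattern the comment above refers to, assuming a plain real-valued filter and a made-up helper name (the library's actual kernels work on interleaved complex data and use ld2/cmla, so this shows only the general idea):

  #include <arm_sve.h>

  // Two independent accumulators let consecutive multiply-accumulates issue
  // without each waiting for the previous result; they are folded together
  // once at the end.
  static inline svfloat32_t fir_two_acc_sketch(svbool_t pg, const float32_t *in,
                                               const float32_t *coeff, int taps) {
    svfloat32_t acc0 = svdup_f32(0);
    svfloat32_t acc1 = svdup_f32(0);
    int t = 0;
    for (; t + 1 < taps; t += 2) {
      svfloat32_t x0 = svld1_f32(pg, in + t);
      svfloat32_t x1 = svld1_f32(pg, in + t + 1);
      acc0 = svmla_n_f32_x(pg, acc0, x0, coeff[t]);
      acc1 = svmla_n_f32_x(pg, acc1, x1, coeff[t + 1]);
    }
    // Odd tap count: fold the final tap into the first accumulator.
    if (t < taps) {
      svfloat32_t x = svld1_f32(pg, in + t);
      acc0 = svmla_n_f32_x(pg, acc0, x, coeff[t]);
    }
    return svadd_f32_x(pg, acc0, acc1);
  }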
const uint64_t *in = (const uint64_t *)input; svfloat32_t y_0 = svdup_f32(0); svfloat32_t y_1 = svdup_f32(0); diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16.c b/src/LowerPHY/FIR/arm_fir_filter_cs16.c index 850279b..71d03c1 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -438,11 +438,10 @@ armral_status armral_fir_filter_cs16( for (; i + svcntw() <= size; i += svcntw()) { sv_fir_block(input + i, coeffs, output + i, ptrue, taps); } - // Input array is not long enough to load svcntw() elements any more, so go 8 - // elements at a time (the spec requires size of the input to be rounded up to - // a multiple of 8) - svbool_t pg = svwhilelt_b16(0U, size + taps); - for (; i + 8 <= size; i += 8) { + // Input array is not long enough to load svcntw() elements any more, so + // process the remaining elements with predication + if (i < size) { + svbool_t pg = svwhilelt_b16(i, size); sv_fir_block(input + i, coeffs, output + i, pg, taps); } #else diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c index 52c32a6..0bb6947 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/LowerPHY/Scrambling/arm_scrambling.cpp b/src/LowerPHY/Scrambling/arm_scrambling.cpp index f65c775..3ff12f6 100644 --- a/src/LowerPHY/Scrambling/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/arm_scrambling.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp index c56628d..98452a0 100644 --- a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp @@ -1,9 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" +#ifdef ARMRAL_ARCH_SVE +#include +#endif + template static inline void generate_seq_128(uint64_t *x) { static_assert(N == 2); @@ -92,13 +96,21 @@ armral_status armral_seq_generator(uint32_t sequence_len, uint32_t seed, // Tail if ((sequence_len % 64) != 0) { - uint8_t *p_out_tail = (uint8_t *)p_out; - uint64_t ptemp_res = x1 ^ x2; uint8_t tail_length = ((sequence_len % 64) + 7) >> 3; + uint64_t ptemp_res = x1 ^ x2; +#ifdef ARMRAL_ARCH_SVE + svbool_t pg = svwhilelt_b8(0, tail_length); + svuint64_t splat_val = svdup_u64(ptemp_res); + svuint8_t splat_val8 = svreinterpret_u8_u64(splat_val); + svst1_u8(pg, (uint8_t *)p_out, splat_val8); +#else + uint8_t *p_out_tail = (uint8_t *)p_out; for (uint32_t i = 0; i < tail_length; i++) { (*p_out_tail) = (uint8_t)(ptemp_res >> (i * 8)); p_out_tail++; } +#endif } + return ARMRAL_SUCCESS; } diff --git a/src/SVD/arm_svd.cpp b/src/SVD/arm_svd.cpp index 1e4e6f2..1d24eeb 100644 --- a/src/SVD/arm_svd.cpp +++ b/src/SVD/arm_svd.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 
Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -798,7 +798,7 @@ armral_status armral_svd_bidiagonal(bool gen_singular_vectors, int m, int n, } // Apply implicitly Q to an input matrix C of the same dimension -// as the marix A that has been factorised into QR or bidiagonalisation. +// as the matrix A that has been factorised into QR or bidiagonalisation. struct apply_q_work_buffers { armral_cmplx_f32_t *q; }; diff --git a/src/SVD/matrix_view.hpp b/src/SVD/matrix_view.hpp index 2bcef83..cc2c4d8 100644 --- a/src/SVD/matrix_view.hpp +++ b/src/SVD/matrix_view.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once @@ -12,10 +12,6 @@ template struct column_major_matrix_view { column_major_matrix_view(T *data, int stride) : m_data(data), m_stride(stride) {} -#ifdef SVD_TEST - column_major_matrix_view(std::vector &data, int stride) - : column_major_matrix_view(data.data(), stride) {} -#endif T &operator()(int i, int j) { return m_data[i + stride() * j]; diff --git a/src/UpperPHY/CRC/arm_crc11.cpp b/src/UpperPHY/CRC/arm_crc11.cpp index 3104b47..c65f3b1 100644 --- a/src/UpperPHY/CRC/arm_crc11.cpp +++ b/src/UpperPHY/CRC/arm_crc11.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc11_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0xa080000000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0xb3fa1f48b92fa293, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0xc420000000000000, 0xc420000000000000, 0x5e60000000000000, diff --git a/src/UpperPHY/CRC/arm_crc16.cpp b/src/UpperPHY/CRC/arm_crc16.cpp index 5029eae..42204c3 100644 --- a/src/UpperPHY/CRC/arm_crc16.cpp +++ b/src/UpperPHY/CRC/arm_crc16.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc16_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0x8420000000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0x11303471a041b343, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0x1021000000000000, 0x1021000000000000, 0xeb23000000000000, diff --git a/src/UpperPHY/CRC/arm_crc24_a.cpp b/src/UpperPHY/CRC/arm_crc24_a.cpp index d02d2ee..3eac9c4 100644 --- a/src/UpperPHY/CRC/arm_crc24_a.cpp +++ b/src/UpperPHY/CRC/arm_crc24_a.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc24_a_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0xa38dea0000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0xf845fe2493242da4, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0x864cfb0000000000, 0x864cfb0000000000, 0xfd7e0c0000000000, diff --git a/src/UpperPHY/CRC/arm_crc24_b.cpp b/src/UpperPHY/CRC/arm_crc24_b.cpp index 1385524..6de6116 100644 --- a/src/UpperPHY/CRC/arm_crc24_b.cpp +++ b/src/UpperPHY/CRC/arm_crc24_b.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + 
Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc24_b_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0xdf24f50000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0xffff83ffe007f83e, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0x8000630000000000, 0x8000630000000000, 0x0900020000000000, diff --git a/src/UpperPHY/CRC/arm_crc24_c.cpp b/src/UpperPHY/CRC/arm_crc24_c.cpp index f1f52fa..0e5e4a7 100644 --- a/src/UpperPHY/CRC/arm_crc24_c.cpp +++ b/src/UpperPHY/CRC/arm_crc24_c.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc24_c_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0x563dff0000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0xc52cdcad524ab8e3, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0xb2b1170000000000, 0xb2b1170000000000, 0x1397990000000000, diff --git a/src/UpperPHY/CRC/arm_crc6.cpp b/src/UpperPHY/CRC/arm_crc6.cpp index c9faf08..f907683 100644 --- a/src/UpperPHY/CRC/arm_crc6.cpp +++ b/src/UpperPHY/CRC/arm_crc6.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "crc_common.hpp" @@ -9,7 +9,7 @@ static const poly64_t crc6_data[] = { // (1<<(64*k)) mod P_CRC, for k = 10 0x5400000000000000, - // (1<<128) / P - (1<<64) + // (1<<128) / P_CRC - (1<<64) 0xfab376938bca3083, // (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] 0x8400000000000000, 0x8400000000000000, 0x8c00000000000000, diff --git a/src/UpperPHY/CRC/crc_basic.hpp b/src/UpperPHY/CRC/crc_basic.hpp index 06181ef..7c3dfcd 100644 --- a/src/UpperPHY/CRC/crc_basic.hpp +++ b/src/UpperPHY/CRC/crc_basic.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 39289e8..59460e0 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -64,16 +64,17 @@ static inline poly64x2_t add_p64x2(poly64x2_t a, poly64x2_t b) { * @tparam BarretShift the shift used when computing @c ls1_divp. 
* @param[in] size number of bytes of the given buffer * @param[in] input points to the input byte sequence - * @param[out] crc24 the computed CRC on 24 bits + * @param[out] crc the computed CRC * @param[in] constants the constants specific to each polynomial: constants[0] = padding - constants[1] = (1<<128) / P - (1<<64) - constants[2:11] = [ (1<<(64*k)) mod P, + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, for k in [1,1,2,3,4,5,6,7,8,9] ] */ template -static void crc64(uint32_t size, const uint64_t *input, uint64_t *crc, - const poly64_t constants[]) { +static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { const poly64_t *p_in = (const poly64_t *)input; if (size == 8) { diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp index 1a6f71b..43cd7da 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" @@ -40,23 +40,6 @@ void compute_path(uint8_t *dec, uint32_t k, uint8_t states, uint8_t const *prev, *i_ptr = i; } -[[maybe_unused]] int32_t compute_bm(int8_t s0, int8_t s1, int8_t s2, int8_t t0, - int8_t t1, int8_t t2) { - int32_t bm = 0; - - // In the AWGN case, minimizing the branch metric bm = sum_{i=1}^3 d_i^2 - // is identical to maximizing the log likelihood along different paths, since - // LL is proportional to -(sum_{i=1}^3 d_i^2). - - bm += abs(s0 - t0); - - bm += abs(s1 - t1); - - bm += abs(s2 - t2); - - return bm; -} - int cmp(const void *a, const void *b) { int ret; const pm_s ia = *(const pm_s *)a; diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp index 19fe7ad..c936f70 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp index 6fe6dc1..469b252 100644 --- a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp +++ b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ namespace { diff --git a/src/UpperPHY/Demodulation/arm_demodulation.c b/src/UpperPHY/Demodulation/arm_demodulation.c index 8046a4c..2a30828 100644 --- a/src/UpperPHY/Demodulation/arm_demodulation.c +++ b/src/UpperPHY/Demodulation/arm_demodulation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp index 4b2739b..0d4fa9b 100644 --- a/src/UpperPHY/LDPC/ldpc_coding.hpp +++ 
b/src/UpperPHY/LDPC/ldpc_coding.hpp @@ -1,12 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once #include "armral.h" -namespace armral_ldpc { +namespace armral::ldpc { + constexpr uint32_t num_lifting_sets = 8; uint32_t get_ldpc_lifting_index(uint32_t lifting_size); @@ -16,4 +17,4 @@ void decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, Allocator &allocator); -} // namespace armral_ldpc +} // namespace armral::ldpc diff --git a/src/UpperPHY/LDPC/ldpc_decoder.cpp b/src/UpperPHY/LDPC/ldpc_decoder.cpp index 99e26ef..1948000 100644 --- a/src/UpperPHY/LDPC/ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "bit_utils.hpp" @@ -38,7 +38,7 @@ struct ldpc_layer_data { row_start_ind = graph->row_start_inds[row]; col_ptr += num_cols; num_cols = graph->row_start_inds[row + 1] - row_start_ind; - shift_ptr = graph->shifts + row_start_ind * armral_ldpc::num_lifting_sets + + shift_ptr = graph->shifts + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; } }; @@ -113,13 +113,13 @@ private: template bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, int32_t num_lanes, - int32_t full_vec, int32_t tail_size); + int32_t full_vec, int32_t tail_size, int8_t *check); template<> bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, int32_t num_lanes, int32_t full_vec, - int32_t tail_size) { + int32_t tail_size, int8_t *check_array) { // Loop through the rows in the base graph bool passed = true; for (uint32_t row = 0; row < graph->nrows && passed; ++row) { @@ -127,18 +127,18 @@ bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; // Loop through the rows in the block for (uint32_t zb = 0; zb < z && passed; ++zb) { // Loop through the columns in the row - int8_t check = 0; + int8_t scal_check = 0; for (uint32_t col = 0; col < num_cols; ++col) { auto shift = (shift_ptr[col] + zb) % z; auto codeword_ind = col_ptr[col] * z + shift; - check ^= llrs[codeword_ind]; + scal_check ^= llrs[codeword_ind]; } - passed &= check >= 0; + passed &= scal_check >= 0; } } return passed; @@ -148,7 +148,7 @@ template<> bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, int32_t num_lanes, int32_t full_vec, - int32_t tail_size) { + int32_t tail_size, int8_t *check) { // Loop through the rows in the base graph bool passed = true; #if ARMRAL_ARCH_SVE >= 2 @@ -158,9 +158,8 @@ bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; - int8_t check[z]; memset(check, 0, z * sizeof(int8_t)); // Loop through the columns @@ -187,9 +186,8 @@ bool 
parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; - int8_t check[z]; memset(check, 0, z * sizeof(int8_t)); // Loop through the columns @@ -223,7 +221,7 @@ template<> bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, const armral_ldpc_base_graph_t *graph, int32_t num_lanes, int32_t full_vec, - int32_t tail_size) { + int32_t tail_size, int8_t *check) { #if ARMRAL_ARCH_SVE >= 2 svbool_t pg = svptrue_b8(); svbool_t pg_tail = svwhilelt_b8(0, tail_size); @@ -235,9 +233,8 @@ bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; - int8_t check[z]; memset(check, 0, z * sizeof(int8_t)); // Loop through the columns @@ -285,9 +282,8 @@ bool parity_check(const int8_t *llrs, uint32_t z, uint32_t lsi, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; - int8_t check[z]; memset(check, 0, z * sizeof(int8_t)); // Loop through the columns @@ -532,7 +528,6 @@ void compute_l_product_min1_and_min2( vst1_s8(l_ptr, l_reg); - int8_t scalar_min2[d->z]; for (uint32_t zb = d->z - tail_size; zb < d->z; ++zb) { l_val = llrs_ptr[zb] - r_ptr[zb]; @@ -576,7 +571,7 @@ void compute_l_product_min1_and_min2( int8_t abs_val = vqabsb_s8(l_val); row_min2_array[zb] = - max(row_min_array[zb], min(scalar_min2[zb], abs_val)); + max(row_min_array[zb], min(row_min2_array[zb], abs_val)); row_min_array[zb] = min(row_min_array[zb], abs_val); l_ptr[zb] = l_val; @@ -1301,7 +1296,7 @@ run_iterations(uint32_t num_its, int z, int lsi, const armral_ldpc_base_graph_t *graph, int8_t *r, int8_t *l, int8_t *new_llrs, int num_lanes, int full_vec, int tail_size, int8_t *row_min_array, int8_t *row_min2_array, - int8_t *row_sign_array, bool check_convergence, + int8_t *row_sign_array, int8_t *check, bool check_convergence, std::optional> &crc_checker) { for (uint32_t i = 0; i < num_its; ++i) { ldpc_layer_data d(z, lsi, graph); @@ -1333,7 +1328,7 @@ run_iterations(uint32_t num_its, int z, int lsi, bool crc_passed = crc_checker.has_value() && crc_checker->check(new_llrs); if (check_convergence && (crc_passed || parity_check(new_llrs, z, lsi, graph, num_lanes, - full_vec, tail_size))) { + full_vec, tail_size, check))) { break; } } @@ -1342,12 +1337,12 @@ run_iterations(uint32_t num_its, int z, int lsi, } // anonymous namespace template -void armral_ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, - uint32_t z, uint32_t crc_idx, uint32_t num_its, - uint8_t *data_out, Allocator &allocator) { +void armral::ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, + uint32_t z, uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out, Allocator &allocator) { // Get the base graph and the lifting size const auto *graph = armral_ldpc_get_base_graph(bg); - uint32_t lsi = armral_ldpc::get_ldpc_lifting_index(z); + uint32_t lsi = 
armral::ldpc::get_ldpc_lifting_index(z); // Only allocate the CRC checker if necessary. std::optional> maybe_crc_checker; @@ -1373,6 +1368,8 @@ void armral_ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, auto row_min2_array = allocate_zeroed(allocator, z); auto row_sign_array = allocate_zeroed(allocator, z); + auto check = allocate_zeroed(allocator, z); + #if ARMRAL_ARCH_SVE >= 2 bool z_is_tiny = (z == 2); #else @@ -1435,24 +1432,27 @@ void armral_ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, #endif if (z_is_tiny) { - run_iterations( - num_its, z, lsi, graph, r.get(), l.get(), new_llrs.get(), num_lanes, - full_vec, tail_size, row_min_array.get(), row_min2_array.get(), - row_sign_array.get(), check_convergence, maybe_crc_checker); + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); // Hard decode into the output variable llrs_to_bits(num_llrs, new_llrs.get(), data_out); } else { if (is_tail_only) { - run_iterations( - num_its, z, lsi, graph, r.get(), l.get(), new_llrs.get(), num_lanes, - full_vec, tail_size, row_min_array.get(), row_min2_array.get(), - row_sign_array.get(), check_convergence, maybe_crc_checker); + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); } else { - run_iterations( - num_its, z, lsi, graph, r.get(), l.get(), new_llrs.get(), num_lanes, - full_vec, tail_size, row_min_array.get(), row_min2_array.get(), - row_sign_array.get(), check_convergence, maybe_crc_checker); + run_iterations(num_its, z, lsi, graph, r.get(), l.get(), + new_llrs.get(), num_lanes, full_vec, tail_size, + row_min_array.get(), row_min2_array.get(), + row_sign_array.get(), check.get(), + check_convergence, maybe_crc_checker); } // Pack LLRs, copy back to original storage auto *out_llrs = maybe_out_llrs.value().get(); @@ -1467,11 +1467,11 @@ void armral_ldpc::decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, } } -template void armral_ldpc::decode_block( +template void armral::ldpc::decode_block( const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, heap_allocator &); -template void armral_ldpc::decode_block( +template void armral::ldpc::decode_block( const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &); @@ -1480,8 +1480,8 @@ armral_status armral_ldpc_decode_block(const int8_t *llrs, uint32_t crc_idx, uint32_t num_its, uint8_t *data_out) { heap_allocator allocator{}; - armral_ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, - allocator); + armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, + allocator); return ARMRAL_SUCCESS; } @@ -1490,8 +1490,8 @@ armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, void *buffer) { buffer_bump_allocator allocator{buffer}; - armral_ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, - allocator); + armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out, + allocator); return ARMRAL_SUCCESS; } @@ -1500,7 +1500,7 @@ uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, uint32_t 
crc_idx, uint32_t num_its) { counting_allocator allocator{}; - armral_ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr, - allocator); + armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr, + allocator); return allocator.required_bytes(); } diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/ldpc_encoder.cpp index 20ee82b..74a8fe4 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/ldpc_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" @@ -950,7 +950,7 @@ inline void set_parity_hdsm_bg1_lsi_not_6(uint32_t z, const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = (z - 1) >> 4U; while (blk_cnt > 0U) { @@ -1036,7 +1036,7 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *ptr_hdsm = parity_hdsm; // zb = 0 to 104 - // Process 16 uint8_t at a time + // Process 16 uint8_t at a time uint32_t blk_cnt = 6; // 105/16 while (blk_cnt > 0U) { @@ -1103,7 +1103,7 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, data_out += 1; // zb = 105 to 207 - // Process 16 uint8_t at a time + // Process 16 uint8_t at a time blk_cnt = 6; // 103/16 while (blk_cnt > 0U) { @@ -1140,7 +1140,7 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, codeword[(25 * z) + zb] = parity_hdsm[3 * z + zb] ^ agg_parity[zb - 105]; } } else { // z != 208 - // Deal with the first row of the loop (zb =0) + // Deal with the first row of the loop (zb =0) { codeword[22 * z] = agg_parity[z - 1]; codeword[23 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; @@ -1154,7 +1154,7 @@ inline void set_parity_hdsm_bg1_lsi_6(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *ptr_agg = agg_parity + 1; const uint8_t *ptr_hdsm = parity_hdsm + 1; - // Process 16 uint8_t at a time + // Process 16 uint8_t at a time uint32_t blk_cnt = (z - 1) >> 4U; while (blk_cnt > 0U) { @@ -1225,7 +1225,7 @@ inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, const uint8_t *parity_hdsm, const uint8_t *agg_parity, uint8_t *codeword) { - // Deal with the first row of the loop (zb =0) + // Deal with the first row of the loop (zb =0) { codeword[10 * z] = agg_parity[z - 1]; codeword[11 * z] = parity_hdsm[0] ^ agg_parity[z - 1]; @@ -1237,7 +1237,7 @@ inline void set_parity_hdsm_bg2_lsi_not_3_nor_7(uint32_t z, const uint8_t *ptr_agg = agg_parity + 1; const uint8_t *ptr_hdsm = parity_hdsm + 1; - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = (z - 1) >> 4U; while (blk_cnt > 0U) { @@ -1311,7 +1311,7 @@ inline void set_parity_hdsm_bg2_lsi_3_or_7(uint32_t z, const uint8_t *ptr_agg = agg_parity; const uint8_t *ptr_hdsm = parity_hdsm; - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = (z - 1) >> 4U; while (blk_cnt > 0U) { @@ -1467,7 +1467,7 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, // the number of index sets (8), and then the lifting set index // is added to this const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * col_entries; uint32_t j = 0; for (; j < col_entries && col_ptr[j] < max_ind; ++j) { @@ -1480,7 +1480,7 @@ inline void 
calc_extension_parity(uint32_t z, uint32_t lsi, auto *codeword_ptr = codeword + block_col * z + shift; // Vectorization of the inner loops - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = (z - shift) >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1514,7 +1514,7 @@ inline void calc_extension_parity(uint32_t z, uint32_t lsi, // Process zb = 0 to shift -1 codeword_ptr = codeword + block_col * z; - // Process 16 entries at a time + // Process 16 entries at a time blk_cnt = shift >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1562,7 +1562,7 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, // is first offset by the row start index multiplied by // the number of index sets (8), and then const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * col_entries; uint32_t j = 0; for (; j < col_entries && col_ptr[j] < graph->nmessage_bits; ++j) { @@ -1575,7 +1575,7 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, auto *in_ptr = bytes_in + block_col * z + shift; // Vectorization of the inner loops - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = (z - shift) >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1609,7 +1609,7 @@ inline void spmv_hdsm(uint32_t z, uint32_t lsi, // Process zb = 0 to shift - 1 in_ptr = bytes_in + block_col * z; - // Process 16 entries at a time + // Process 16 entries at a time blk_cnt = shift >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1652,7 +1652,7 @@ inline void copy_input_message(uint32_t z, uint8_t *out_ptr = codeword + j * z; const uint8_t *in_ptr = bytes_in + j * z; - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = z >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1686,7 +1686,7 @@ inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, // First iteration, tmp_parity is vector of 0 uint8_t *out_ptr = tmp_parity; const uint8_t *in_ptr = parity_hdsm; - // Process 16 entries at a time + // Process 16 entries at a time uint32_t blk_cnt = z >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1721,7 +1721,7 @@ inline void calc_hdsm_rhs(uint32_t z, const uint8_t *parity_hdsm, for (uint32_t j = 1; j < 4; ++j) { out_ptr = tmp_parity; in_ptr = parity_hdsm + z * j; - // Process 16 entries at a time + // Process 16 entries at a time blk_cnt = z >> 4U; while (blk_cnt > 0U) { // Load inputs @@ -1767,6 +1767,7 @@ armral_status ldpc_encode_block(const uint8_t *data_in, armral_ldpc_graph_t bg, auto parity_hdsm = allocate_zeroed(allocator, 4 * z); auto codeword = allocate_zeroed(allocator, (graph->ncodeword_bits + 2) * z); + auto tmp_parity = allocate_zeroed(allocator, z); if constexpr (Allocator::is_counting) { return ARMRAL_SUCCESS; @@ -1777,7 +1778,7 @@ armral_status ldpc_encode_block(const uint8_t *data_in, armral_ldpc_graph_t bg, bytes_in.get()); // Get the lifting set index - auto lsi = armral_ldpc::get_ldpc_lifting_index(z); + auto lsi = armral::ldpc::get_ldpc_lifting_index(z); // The encoding is done by computing: // 1- Parity bits for the high-density submatrix (hdsm) @@ -1792,11 +1793,10 @@ armral_status ldpc_encode_block(const uint8_t *data_in, armral_ldpc_graph_t bg, // Build the right-hand side of the linear systems // to solve for hdsm parity computation - uint8_t tmp_parity[z]; - calc_hdsm_rhs(z, parity_hdsm.get(), tmp_parity); + calc_hdsm_rhs(z, parity_hdsm.get(), tmp_parity.get()); // Finally, computation of hdsm parity bits - calc_hdsm_parity(z, lsi, bg, 
graph, parity_hdsm.get(), tmp_parity, + calc_hdsm_parity(z, lsi, bg, graph, parity_hdsm.get(), tmp_parity.get(), codeword.get()); // 2- Parity bits for the extension matrix. @@ -1848,7 +1848,7 @@ armral_ldpc_get_base_graph(armral_ldpc_graph_t bg) { return bg == LDPC_BASE_GRAPH_1 ? &base_graph_1 : &base_graph_2; } -uint32_t armral_ldpc::get_ldpc_lifting_index(uint32_t lifting_size) { +uint32_t armral::ldpc::get_ldpc_lifting_index(uint32_t lifting_size) { // Each lifting size is either a power of two, // or an odd multiple (up to 15) of a power of two. Find the first odd // number when shifting right, diff --git a/src/UpperPHY/LDPC/ldpc_rate_common.hpp b/src/UpperPHY/LDPC/ldpc_rate_common.hpp new file mode 100644 index 0000000..3858f49 --- /dev/null +++ b/src/UpperPHY/LDPC/ldpc_rate_common.hpp @@ -0,0 +1,30 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include "armral.h" + +namespace { + +uint32_t starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, + uint32_t ncb, uint32_t z) { + // Starting position k0 of different redundancy versions + // given as Table 5.4.2.1-2 in 3GPP TS 38.212. + if (rv == 0) { + return 0; + } + if (rv == 1) { + return (17 * z - (int)bg * 4 * z) * (ncb / n); + } + if (rv == 2) { + return (33 * z - (int)bg * 8 * z) * (ncb / n); + } + if (rv == 3) { + return (56 * z - (int)bg * 13 * z) * (ncb / n); + } + return 0; +} + +} // anonymous namespace diff --git a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp index 40952c0..2324c2f 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_matching.cpp @@ -1,9 +1,10 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" +#include "ldpc_rate_common.hpp" #include "utils/allocators.hpp" #include #include @@ -21,13 +22,17 @@ void copy_bits(uint32_t src_bit, uint32_t start_idx, uint32_t len, uint32_t l, } } -static void bit_selection(uint32_t z, uint32_t n, uint32_t e, - uint32_t len_filler_bits, uint32_t k, uint32_t k0, - const uint8_t *in, uint8_t *out, - uint8_t *scratch_buf1, uint8_t *scratch_buf2) { +void bit_selection(uint32_t z, uint32_t n, uint32_t e, uint32_t len_filler_bits, + uint32_t k, uint32_t k0, const uint8_t *in, uint8_t *out, + uint8_t *scratch_buf1, uint8_t *scratch_buf2) { + assert(n > 0); + assert(e > 0); + assert(k0 < n); + assert(n % 2 == 0); + const uint8_t *in_bits = in; // bit selection as specified by section 5.4.2.1 in 3GPP TS 38.212 - // remove Filler bits + // remove filler bits if (len_filler_bits > 0) { uint32_t len_s_f_bits = k - z * 2; // length of systematic & filler bits @@ -69,10 +74,12 @@ static void bit_selection(uint32_t z, uint32_t n, uint32_t e, } } -static void bit_interleave(uint32_t e, uint32_t qm, const uint8_t *in, - uint8_t *out) { +void bit_interleave(uint32_t e, uint32_t qm, const uint8_t *in, uint8_t *out) { // performs the bit interleaving step of LDPC encoding, as specified in // section 5.4.2.2 of 3GPP TS 38.212. + + assert(e % qm == 0); + memset((void *)out, 0, (e + 7) / 8); // transpose @@ -88,25 +95,6 @@ static void bit_interleave(uint32_t e, uint32_t qm, const uint8_t *in, } } -static int starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, - uint32_t ncb, uint32_t z) { - // Starting position k0 of different redundancy versions - // given as Table 5.4.2.1-2 in 3GPP TS 38.212. 
- if (rv == 0) { - return 0; - } - if (rv == 1) { - return (17 * z - (int)bg * 4 * z) * (ncb / n); - } - if (rv == 2) { - return (33 * z - (int)bg * 8 * z) * (ncb / n); - } - if (rv == 3) { - return (56 * z - (int)bg * 13 * z) * (ncb / n); - } - return 0; -} - template armral_status rate_matching(armral_ldpc_graph_t bg, uint32_t z, uint32_t e, uint32_t nref, uint32_t len_filler_bits, uint32_t k, diff --git a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp index c46165e..6fa9b6c 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp +++ b/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp @@ -1,8 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" +#include "ldpc_rate_common.hpp" #include "utils/allocators.hpp" #include @@ -28,10 +29,10 @@ void undo_selection(uint32_t z, uint32_t n, uint32_t e, // performs the inverse of the bit selection as specified by // section 5.4.2.1 in 3GPP TS 38.212 - assert(k0 >= 0 && k0 < n); + assert(k0 < n); assert(e > 0); - // systematic bits len + // systematic bits len uint32_t len_s_bits = k - len_filler_bits - (2 * z); uint32_t k_idx = 0; uint32_t k0_start = k0; @@ -97,6 +98,7 @@ void undo_selection(uint32_t z, uint32_t n, uint32_t e, void undo_interleave(uint32_t e, uint32_t qm, const int8_t *in, int8_t *out) { // performs the inverse of the bit interleaving step of LDPC encoding, // as specified in section 5.4.2.2 of 3GPP TS 38.212. + assert(e > qm); assert(qm > 0); assert(e % qm == 0); @@ -111,26 +113,6 @@ void undo_interleave(uint32_t e, uint32_t qm, const int8_t *in, int8_t *out) { } } -static int starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, - uint32_t ncb, uint32_t z) { - // Duplicate of routine of the same name in ldpc_rate_matching.cpp - // Starting position k0 of different redundancy versions - // given as Table 5.4.2.1-2 in 3GPP TS 38.212. 
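The de-interleaving performed by undo_interleave() is the inverse of the transpose in section 5.4.2.2 of 3GPP TS 38.212, which maps interleaved bit i + j*Qm to coded bit i*(E/Qm) + j. A scalar sketch operating on one LLR per int8_t (illustrative only, assuming e is a multiple of qm as the assert above requires):

static void deinterleave_reference(uint32_t e, uint32_t qm, const int8_t *in,
                                   int8_t *out) {
  uint32_t cols = e / qm;
  for (uint32_t i = 0; i < qm; ++i) {
    for (uint32_t j = 0; j < cols; ++j) {
      // Element i*(E/Qm) + j of the de-interleaved sequence is recovered
      // from element i + j*Qm of the interleaved one.
      out[i * cols + j] = in[i + j * qm];
    }
  }
}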
- if (rv == 0) { - return 0; - } - if (rv == 1) { - return (17 * z - (int)bg * 4 * z) * (ncb / n); - } - if (rv == 2) { - return (33 * z - (int)bg * 8 * z) * (ncb / n); - } - if (rv == 3) { - return (56 * z - (int)bg * 13 * z) * (ncb / n); - } - return 0; -} - template armral_status rate_recovery(armral_ldpc_graph_t bg, uint32_t z, uint32_t e, uint32_t nref, uint32_t len_filler_bits, uint32_t k, diff --git a/src/UpperPHY/Modulation/arm_modulation.c b/src/UpperPHY/Modulation/arm_modulation.c index ab58bae..96c91bb 100644 --- a/src/UpperPHY/Modulation/arm_modulation.c +++ b/src/UpperPHY/Modulation/arm_modulation.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp index 73f2fbd..3925063 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_crc_check.cpp b/src/UpperPHY/Polar/arm_polar_crc_check.cpp index 77d928d..f885073 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_check.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_check.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_decoder.cpp b/src/UpperPHY/Polar/arm_polar_decoder.cpp index 2cf77b6..4c4cc8d 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.cpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.cpp @@ -1,336 +1,37 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" #include -#include -#include -#include -#include -#include -#include +#include +#include -namespace { - -static inline uint8x16_t vld_histq_l8(const uint8_t *hist) { - return vreinterpretq_u8_u64(vld1q_dup_u64((const uint64_t *)hist)); -} - -static inline uint8x16_t vld_histq_l4(const uint8_t *hist) { - return vreinterpretq_u8_u32(vld1q_dup_u32((const uint32_t *)hist)); -} - -static inline uint8x16_t vld_histq_l2(const uint8_t *hist) { - return vreinterpretq_u8_u16(vld1q_dup_u16((const uint16_t *)hist)); -} +#include "arm_polar_decoder_neon.hpp" -static inline uint8x8_t vld_hist_l4(const uint8_t *hist) { - return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)hist)); -} - -static inline uint8x8_t vld_hist_l2(const uint8_t *hist) { - return vreinterpret_u8_u16(vld1_dup_u16((const uint16_t *)hist)); -} - -static inline uint32x4_t vmlsl_u32_s16(uint32x4_t acc, uint16x4_t x, - uint16x4_t y) { - return vreinterpretq_u32_s32(vmlsl_s16(vreinterpretq_s32_u32(acc), - vreinterpret_s16_u16(x), - vreinterpret_s16_u16(y))); -} +namespace { typedef struct { const uint8_t *frozen_bits_mask; uint32_t curr_bit_idx; } sc_decoder; -// Extract the sign of an integer -static inline int8_t __attribute__((always_inline)) sign(int32_t x) { - return static_cast(x > 0) - static_cast(x < 0); -} - -// calculate the minimum absolute value between two integers -static inline int16_t 
__attribute__((always_inline)) -min(const int8_t x, const int8_t y) { - return abs(x) < abs(y) ? abs(x) : abs(y); -} - -template -static inline void zip1_l(const int8_t *__restrict in, int8_t *__restrict out) { - static_assert(N % 16 == 0); - if constexpr (L == 8) { - for (int i = 0; i < N; i += 2) { - int8x16_t x1 = vld1q_dup_s8(in); - int8x16_t x2 = vld1q_dup_s8(in + 1); - vst1q_s8(out, vextq_s8(x1, x2, 8)); - in += 2; - out += 16; - } - } else if constexpr (L == 4) { - for (int i = 0; i < N; i += 4) { - int8x8_t x0 = vld1s_s8(in); - int8x16_t x1 = vzip1l_s8(x0, x0); - int8x16_t x2 = vzip1q_s8(x1, x1); - vst1q_s8(out, x2); - in += 4; - out += 16; - } - } else if constexpr (L == 2) { - for (int i = 0; i < N; i += 8) { - int8x8_t x0 = vld1_s8(in); - int8x16_t x1 = vzip1l_s8(x0, x0); - vst1q_s8(out, x1); - in += 8; - out += 16; - } - } else { - assert(false && "unimplemented!"); - } -} - -// calculate beliefs for left children in SCL algorithm -template -static inline void f(const int8_t *r1, const int8_t *r2, int8_t *output) { - int16_t l = length >> 4; - while (l > 0) { - int8x16_t llr1 = vld1q_s8(r1); - int8x16_t llr2 = vld1q_s8(r2); - uint8x16_t sign_vect = vcltzq_s8(veorq_s8(llr1, llr2)); - llr1 = vqabsq_s8(llr1); - llr2 = vqabsq_s8(llr2); - int8x16_t result = vminq_s8(llr1, llr2); - int8x16_t result_neg = vnegq_s8(result); - result = vbslq_s8(sign_vect, result_neg, result); - vst1q_s8(output, result); - l--; - r1 += 16; - r2 += 16; - output += 16; - } - - if ((length >> 3) & 1) { - int8x8_t llr1 = vld1_s8(r1); - int8x8_t llr2 = vld1_s8(r2); - uint8x8_t sign_vect = vcltz_s8(veor_s8(llr1, llr2)); - llr1 = vqabs_s8(llr1); - llr2 = vqabs_s8(llr2); - int8x8_t result = vmin_s8(llr1, llr2); - int8x8_t result_neg = vneg_s8(result); - result = vbsl_s8(sign_vect, result_neg, result); - vst1_s8(output, result); - r1 += 8; - r2 += 8; - output += 8; - } - - l = length & 0x7; - while (l > 0) { - int8_t a = *r1++; - int8_t b = *r2++; - *output++ = sat_8(sign(a * b) * min(a, b)); - l--; - } +inline void __attribute__((always_inline)) +setup_sc_decoder(sc_decoder *decoder, const uint8_t *frozen) { + decoder->curr_bit_idx = 0; + decoder->frozen_bits_mask = frozen; } template -static inline void f_l(const int8_t *in, int8_t *out) { +inline void f_l(const int8_t *in, int8_t *out) { f(in, &in[Nhalf * L], out); } -template -static inline void g(const int8_t *r1, const int8_t *r2, const uint8_t *dec, - int8_t *output) { - // Calculate beliefs for right children in the successive cancellation (SC) - // algorithm: - // g(a, b, c=0) = a + b - // g(a, b, c=1) = a - b - int16_t l = length >> 4; - while (l > 0) { - int8x16_t llr1 = vld1q_s8(r1); - int8x16_t llr2 = vld1q_s8(r2); - uint8x16_t bit = vld1q_u8(dec); - int8x16_t result = - vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); - vst1q_s8(output, result); - l--; - r1 += 16; - r2 += 16; - dec += 16; - output += 16; - } - - if ((length >> 3) & 1) { - int8x8_t llr1 = vld1_s8(r1); - int8x8_t llr2 = vld1_s8(r2); - uint8x8_t bit = vld1_u8(dec); - int8x8_t result = - vbsl_s8(vceqz_u8(bit), vqadd_s8(llr2, llr1), vqsub_s8(llr2, llr1)); - vst1_s8(output, result); - r1 += 8; - r2 += 8; - dec += 8; - output += 8; - } - - l = length & 0x7; - while (l > 0) { - int8_t a = *r1++; - int8_t b = *r2++; - int8_t c = *dec++; - *output++ = sat_8((int16_t)(b + (1 - 2 * c) * a)); - l--; - } -} - -template -struct g_l_impl { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - for (int i = 0; i < Nhalf; ++i) { - 
for (int j = 0; j < L; ++j) { - uint8_t h = L > 1 ? hist[j] : 0; - int8_t a = in[i * L + h]; - int8_t b = in[(i + Nhalf) * L + h]; - uint8_t c = dec[i * L + j]; - out[i * L + j] = sat_8((int16_t)(b + (1 - 2 * c) * a)); - } - } - } -}; - -static inline void g_l_x8(const int8_t *in, const uint8_t *dec, - const uint8x8_t h8, uint8x8_t xs_idx, int8_t *out) { - xs_idx += h8; - int8x8_t as = vld1_s8(in); - int8x8_t bs = vld1_s8(&in[8]); - - int8x8_t llr1 = vtbl1_s8(as, vreinterpret_s8_u8(xs_idx)); - int8x8_t llr2 = vtbl1_s8(bs, vreinterpret_s8_u8(xs_idx)); - - uint8x8_t bit = vld1_u8(dec); - - int8x8_t result = - vbsl_s8(vceqz_u8(bit), vqadd_s8(llr2, llr1), vqsub_s8(llr2, llr1)); - vst1_s8(out, result); -} - -static inline void g_l_x16(const int8_t *in, const uint8_t *dec, - const uint8x16_t h8, uint8x16_t xs_idx, - int8_t *out) { - xs_idx += h8; - int8x16_t as = vld1q_s8(in); - int8x16_t bs = vld1q_s8(&in[16]); - - int8x16_t llr1 = vqtbl1q_s8(as, xs_idx); - int8x16_t llr2 = vqtbl1q_s8(bs, xs_idx); - - uint8x16_t bit = vld1q_u8(dec); - - int8x16_t result = - vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); - vst1q_s8(out, result); -} - -template -static inline void g_l_x16_loop(const int8_t *in, const uint8_t *dec, - const uint8x16_t h8, uint8x16_t xs_idx, - int8_t *out) { - xs_idx += h8; - for (int i = 0; i < Nhalf; i += Max_Count) { - int8x16_t as = vld1q_s8(&in[i * L]); - int8x16_t bs = vld1q_s8(&in[(i + Nhalf) * L]); - - int8x16_t llr1 = vqtbl1q_s8(as, xs_idx); - int8x16_t llr2 = vqtbl1q_s8(bs, xs_idx); - - uint8x16_t bit = vld1q_u8(&dec[i * L]); - - int8x16_t result = - vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); - vst1q_s8(out, result); - out += 16; - } -} - -template -struct g_l_impl 2)>> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - uint8x16_t h8 = vld_histq_l8(hist); - uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; - g_l_x16_loop(in, dec, h8, xs_idx, out); - } -}; - -template -struct g_l_impl 4)>> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - uint8x16_t h8 = vld_histq_l4(hist); - uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - g_l_x16_loop(in, dec, h8, xs_idx, out); - } -}; - -template -struct g_l_impl= 8)>> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - uint8x16_t h8 = vld_histq_l2(hist); - uint8x16_t xs_idx = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - g_l_x16_loop(in, dec, h8, xs_idx, out); - } -}; - -template<> -struct g_l_impl<2, 4> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=4). - uint8x8_t h8 = vld_hist_l4(hist); - uint8x8_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4}; - g_l_x8(in, dec, h8, xs_idx, out); - } -}; - -template<> -struct g_l_impl<2, 8> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - // specialised N=2-byte chunks interleaved (times L=8). 
- uint8x16_t h8 = vld_histq_l8(hist); - uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; - g_l_x16(in, dec, h8, xs_idx, out); - } -}; - -template<> -struct g_l_impl<4, 2> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - uint8x8_t h8 = vld_hist_l2(hist); - uint8x8_t xs_idx = {0, 0, 2, 2, 4, 4, 6, 6}; - g_l_x8(in, dec, h8, xs_idx, out); - } -}; - -template<> -struct g_l_impl<4, 4> { - static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { - // specialised N=4-byte chunks interleaved (times L=4). - uint8x16_t h8 = vld_histq_l4(hist); - uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - g_l_x16(in, dec, h8, xs_idx, out); - } -}; - template -static inline void g_l(const int8_t *in, const uint8_t *dec, - const uint8_t *hist, int8_t *out) { +inline void g_l(const int8_t *in, const uint8_t *dec, const uint8_t *hist, + int8_t *out) { // Calculate beliefs for right children in the successive cancellation list // (SCL) algorithm: // g(a_h, b_h, c_i=0) = a_h + b_h @@ -346,7 +47,7 @@ static inline void g_l(const int8_t *in, const uint8_t *dec, } template -static inline void g_top(const int8_t *in, const uint8_t *dec, int8_t *out) { +inline void g_top(const int8_t *in, const uint8_t *dec, int8_t *out) { // no history here, since no differing beliefs to choose from. static_assert(Nhalf >= 16); static_assert(Nhalf % 4 == 0); @@ -393,261 +94,39 @@ static inline void g_top(const int8_t *in, const uint8_t *dec, int8_t *out) { } } -template -static inline void __attribute__((always_inline)) -combine_l(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, - const uint8_t *hist) { - static_assert(Nhalf >= 2); +template +inline void zip1_l(const int8_t *__restrict in, int8_t *__restrict out) { + static_assert(N % 16 == 0); if constexpr (L == 8) { - uint8x16_t h8 = vld_histq_l8(hist); - uint8x16_t x0_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; - x0_idx += h8; - for (int i = 0; i < Nhalf; i += 2) { - uint8x16_t x0 = vld1q_u8(dec1); - uint8x16_t x1 = vld1q_u8(dec2); - x0 = vqtbl1q_u8(x0, x0_idx); - vst1q_u8(output, x0 ^ x1); - vst1q_u8(&output[Nhalf * L], x1); - dec1 += 16; - dec2 += 16; - output += 16; - } - } else if constexpr (L == 4 && Nhalf % 4 == 0) { - uint8x16_t h8 = vld_histq_l4(hist); - uint8x16_t x0_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - x0_idx += h8; - for (int i = 0; i < Nhalf; i += 4) { - uint8x16_t x0 = vld1q_u8(dec1); - uint8x16_t x1 = vld1q_u8(dec2); - x0 = vqtbl1q_u8(x0, x0_idx); - vst1q_u8(output, x0 ^ x1); - vst1q_u8(&output[Nhalf * L], x1); - dec1 += 16; - dec2 += 16; - output += 16; + for (int i = 0; i < N; i += 2) { + int8x16_t x1 = vld1q_dup_s8(in); + int8x16_t x2 = vld1q_dup_s8(in + 1); + vst1q_s8(out, vextq_s8(x1, x2, 8)); + in += 2; + out += 16; } - } else if constexpr (L == 2 && Nhalf % 8 == 0) { - uint8x16_t h8 = vld_histq_l2(hist); - uint8x16_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - x0_idx += h8; - for (int i = 0; i < Nhalf; i += 8) { - uint8x16_t x0 = vld1q_u8(dec1); - uint8x16_t x1 = vld1q_u8(dec2); - x0 = vqtbl1q_u8(x0, x0_idx); - vst1q_u8(output, x0 ^ x1); - vst1q_u8(&output[Nhalf * L], x1); - dec1 += 16; - dec2 += 16; - output += 16; + } else if constexpr (L == 4) { + for (int i = 0; i < N; i += 4) { + int8x8_t x0 = vld1s_s8(in); + int8x16_t x1 = vzip1l_s8(x0, x0); + int8x16_t x2 = vzip1q_s8(x1, x1); + vst1q_s8(out, x2); + in += 4; + out += 16; } - } else if constexpr (L 
== 1) { - for (int i = 0; i < Nhalf; ++i) { - output[i] = dec1[i] ^ dec2[i]; - output[Nhalf + i] = dec2[i]; + } else if constexpr (L == 2) { + for (int i = 0; i < N; i += 8) { + int8x8_t x0 = vld1_s8(in); + int8x16_t x1 = vzip1l_s8(x0, x0); + vst1q_s8(out, x1); + in += 8; + out += 16; } } else { - uint8_t x0[Nhalf * L]; - uint8_t x1[Nhalf * L]; - for (int j = 0; j < Nhalf; ++j) { - for (int i = 0; i < L; ++i) { - uint8_t h = L > 1 ? hist[i] : 0; - x0[L * j + i] = dec1[L * j + h]; - x1[L * j + i] = dec2[L * j + i]; - } - } - for (int i = 0; i < L * Nhalf; ++i) { - output[i] = x0[i] ^ x1[i]; - output[L * Nhalf + i] = x1[i]; - } - } -} - -template<> -inline void __attribute__((always_inline)) -combine_l<2, 2>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, - const uint8_t *hist) { - uint8x8_t h8 = vld_hist_l2(hist); - uint8x8_t x0 = vld1s_u8(dec1); - uint8x8_t x1 = vld1s_u8(dec2); - - uint8x8_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6}; - x0_idx += h8; - x0 = vtbl1_u8(x0, x0_idx); - - *(uint32_t *)output = vreinterpret_u32_u8(x0 ^ x1)[0]; - output += 4; - *(uint32_t *)output = vreinterpret_u32_u8(x1)[0]; -} - -template<> -inline void __attribute__((always_inline)) -combine_l<2, 4>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, - const uint8_t *hist) { - uint8x8_t h8 = vld_hist_l4(hist); - uint8x8_t x0 = vld1_u8(dec1); - uint8x8_t x1 = vld1_u8(dec2); - - uint8x8_t x0_idx = {0, 0, 0, 0, 4, 4, 4, 4}; - x0_idx += h8; - x0 = vtbl1_u8(x0, x0_idx); - - vst1_u8(output, x0 ^ x1); - vst1_u8(&output[8], x1); -} - -template<> -inline void __attribute__((always_inline)) -combine_l<4, 2>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, - const uint8_t *hist) { - uint8x8_t h8 = vld_hist_l2(hist); - uint8x8_t x0 = vld1_u8(dec1); - uint8x8_t x1 = vld1_u8(dec2); - - uint8x8_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6}; - x0_idx += h8; - x0 = vtbl1_u8(x0, x0_idx); - - vst1_u8(output, x0 ^ x1); - vst1_u8(&output[8], x1); -} - -template -static inline void combine_hist(const uint8_t *hist1, const uint8_t *hist2, - uint8_t *hist) { - for (int i = 0; i < L; ++i) { - hist[i] = hist1[hist2[i]]; - } -} - -template<> -inline void combine_hist<8>(const uint8_t *hist1, const uint8_t *hist2, - uint8_t *hist) { - uint8x8_t h1 = vld1_u8(hist1); - uint8x8_t h2 = vld1_u8(hist2); - uint8x8_t h = vtbl1_u8(h1, h2); - vst1_u8(hist, h); -} - -template<> -inline void combine_hist<4>(const uint8_t *hist1, const uint8_t *hist2, - uint8_t *hist) { - uint8x8_t h1 = vld1s_u8(hist1); - uint8x8_t h2 = vld1s_u8(hist2); - uint8x8_t h = vtbl1_u8(h1, h2); - *(uint32_t *)hist = vreinterpret_u32_u8(h)[0]; -} - -template<> -inline void combine_hist<2>(const uint8_t *hist1, const uint8_t *hist2, - uint8_t *hist) { - uint8x8_t h1 = vld1h_u8(hist1); - uint8x8_t h2 = vld1h_u8(hist2); - uint8x8_t h = vtbl1_u8(h1, h2); - *(uint16_t *)hist = vreinterpret_u16_u8(h)[0]; -} - -template<> -inline void combine_hist<1>(const uint8_t * /*hist1*/, - const uint8_t * /*hist2*/, uint8_t * /*hist*/) { - // nothing to do if L=1, only one choice of history. -} - -template -static inline void combine_seq_out(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - for (int i = 0; i < L; ++i) { - uint8_t h = L > 1 ? 
hist2[i] : 0; - memcpy((void *)&p_u_seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2); - memcpy((void *)&p_u_seq_out[i * N + N / 2], (const void *)&seq2[i * N / 2], - N / 2); + assert(false && "unimplemented!"); } } -template<> -inline void combine_seq_out<2, 2>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint8x8_t h = vld1h_u8(hist2); - uint8x8_t s1 = vtbl1_u8(vld1h_u8(seq1), h); - uint8x8_t s2 = vld1h_u8(seq2); - *(uint32_t *)p_u_seq_out = vreinterpret_u32_u8(vzip1_u8(s1, s2))[0]; -} - -template<> -inline void combine_seq_out<2, 4>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint8x8_t h = vld1s_u8(hist2); - uint8x8_t s1 = vtbl1_u8(vld1s_u8(seq1), h); - uint8x8_t s2 = vld1s_u8(seq2); - vst1_u8(p_u_seq_out, vzip1_u8(s1, s2)); -} - -template<> -inline void combine_seq_out<2, 8>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint8x8_t h = vld1_u8(hist2); - uint8x8_t s1 = vtbl1_u8(vld1_u8(seq1), h); - uint8x8_t s2 = vld1_u8(seq2); - vst1q_u8(p_u_seq_out, vzip1l_u8(s1, s2)); -} - -template<> -inline void combine_seq_out<4, 2>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint16x4_t in1 = vld1s_u16((const uint16_t *)seq1); - uint16x4_t in2 = vld1s_u16((const uint16_t *)seq2); - - uint8x8_t h = vld1h_u8(hist2); - h = vzip1_u8(h, h); - uint8x8_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1}; - h_ofs0 = vsli_n_u8(h_ofs0, h, 1); - - in1 = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(in1), h_ofs0)); - - vst1_u16((uint16_t *)p_u_seq_out, vzip1_u16(in1, in2)); -} - -template<> -inline void combine_seq_out<4, 4>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint16x4_t in1 = vld1_u16((const uint16_t *)seq1); - uint16x4_t in2 = vld1_u16((const uint16_t *)seq2); - - uint8x8_t h = vld1s_u8(hist2); - h = vzip1_u8(h, h); - uint8x8_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1}; - h_ofs0 = vsli_n_u8(h_ofs0, h, 1); - - in1 = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(in1), h_ofs0)); - - vst1q_u16((uint16_t *)p_u_seq_out, vzip1l_u16(in1, in2)); -} - -template<> -inline void combine_seq_out<4, 8>(const uint8_t *seq1, const uint8_t *seq2, - const uint8_t *hist2, uint8_t *p_u_seq_out) { - uint16x8_t in1 = vld1q_u16((const uint16_t *)seq1); - uint16x8_t in2 = vld1q_u16((const uint16_t *)seq2); - - uint8x16_t h = vcombine_u8(vld1_u8(hist2), vdup_n_u8(0)); - h = vzip1q_u8(h, h); - uint8x16_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - h_ofs0 = vsliq_n_u8(h_ofs0, h, 1); - - in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), h_ofs0)); - - vst1q_u16((uint16_t *)p_u_seq_out, vzip1q_u16(in1, in2)); - vst1q_u16((uint16_t *)p_u_seq_out + 8, vzip2q_u16(in1, in2)); -} - -static inline void __attribute__((always_inline)) -setup_sc_decoder(sc_decoder *decoder, const uint8_t *frozen) { - decoder->curr_bit_idx = 0; - decoder->frozen_bits_mask = frozen; -} - -template -struct polar_stage; - template struct sort_decoder_entries_impl { static inline void sort(uint8_t *est_bits, uint32_t *pm, uint8_t *hist) { @@ -678,11 +157,19 @@ struct sort_decoder_entries_impl { }; template -static void sort_decoder_entries(uint8_t *est_bits, uint32_t *pm, - uint8_t *hist) { +void sort_decoder_entries(uint8_t *est_bits, uint32_t *pm, uint8_t *hist) { sort_decoder_entries_impl::sort(est_bits, pm, hist); } +template +struct polar_stage; + +inline uint32x4_t vmlsl_u32_s16(uint32x4_t acc, uint16x4_t x, 
uint16x4_t y) { + return vreinterpretq_u32_s32(vmlsl_s16(vreinterpretq_s32_u32(acc), + vreinterpret_s16_u16(x), + vreinterpret_s16_u16(y))); +} + template struct polar_stage<2, L> { static inline void __attribute__((always_inline)) @@ -1035,7 +522,7 @@ struct polar_stage<2, 8> { } }; -static inline uint8_t __attribute__((always_inline)) +inline uint8_t __attribute__((always_inline)) estimate_bit(uint32_t frozen, const int32_t l_u, uint32_t idx) { return (frozen & 0x80) != 0U ? 0 : static_cast(l_u < 0); } diff --git a/src/UpperPHY/Polar/arm_polar_decoder.hpp b/src/UpperPHY/Polar/arm_polar_decoder.hpp new file mode 100644 index 0000000..ef2091c --- /dev/null +++ b/src/UpperPHY/Polar/arm_polar_decoder.hpp @@ -0,0 +1,21 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ + +#pragma once + +namespace { + +// Extract the sign of an integer +inline int8_t __attribute__((always_inline)) sign(int32_t x) { + return static_cast(x > 0) - static_cast(x < 0); +} + +// calculate the minimum absolute value between two integers +inline int16_t __attribute__((always_inline)) +min(const int8_t x, const int8_t y) { + return abs(x) < abs(y) ? abs(x) : abs(y); +} + +} // namespace diff --git a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp new file mode 100644 index 0000000..249f2e0 --- /dev/null +++ b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp @@ -0,0 +1,507 @@ +/* + Arm RAN Acceleration Library + Copyright 2023-2024 Arm Limited and/or its affiliates +*/ + +#pragma once +#include "arm_polar_decoder.hpp" + +namespace { + +inline uint8x16_t vld_histq_l8(const uint8_t *hist) { + return vreinterpretq_u8_u64(vld1q_dup_u64((const uint64_t *)hist)); +} + +inline uint8x16_t vld_histq_l4(const uint8_t *hist) { + return vreinterpretq_u8_u32(vld1q_dup_u32((const uint32_t *)hist)); +} + +inline uint8x8_t vld_hist_l4(const uint8_t *hist) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)hist)); +} + +inline uint8x16_t vld_histq_l2(const uint8_t *hist) { + return vreinterpretq_u8_u16(vld1q_dup_u16((const uint16_t *)hist)); +} + +inline uint8x8_t vld_hist_l2(const uint8_t *hist) { + return vreinterpret_u8_u16(vld1_dup_u16((const uint16_t *)hist)); +} + +template +struct g_l_impl { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + for (int i = 0; i < Nhalf; ++i) { + for (int j = 0; j < L; ++j) { + uint8_t h = L > 1 ? 
hist[j] : 0; + int8_t a = in[i * L + h]; + int8_t b = in[(i + Nhalf) * L + h]; + uint8_t c = dec[i * L + j]; + out[i * L + j] = sat_8((int16_t)(b + (1 - 2 * c) * a)); + } + } + } +}; + +template +inline void g_l_x16_loop(const int8_t *in, const uint8_t *dec, + const uint8x16_t h8, uint8x16_t xs_idx, int8_t *out) { + xs_idx += h8; + for (int i = 0; i < Nhalf; i += Max_Count) { + int8x16_t as = vld1q_s8(&in[i * L]); + int8x16_t bs = vld1q_s8(&in[(i + Nhalf) * L]); + + int8x16_t llr1 = vqtbl1q_s8(as, xs_idx); + int8x16_t llr2 = vqtbl1q_s8(bs, xs_idx); + + uint8x16_t bit = vld1q_u8(&dec[i * L]); + + int8x16_t result = + vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); + vst1q_s8(out, result); + out += 16; + } +} + +template +struct g_l_impl 2)>> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + uint8x16_t h8 = vld_histq_l8(hist); + uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + g_l_x16_loop(in, dec, h8, xs_idx, out); + } +}; + +template +struct g_l_impl 4)>> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + uint8x16_t h8 = vld_histq_l4(hist); + uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + g_l_x16_loop(in, dec, h8, xs_idx, out); + } +}; + +template +struct g_l_impl= 8)>> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + uint8x16_t h8 = vld_histq_l2(hist); + uint8x16_t xs_idx = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + g_l_x16_loop(in, dec, h8, xs_idx, out); + } +}; + +inline void g_l_x8(const int8_t *in, const uint8_t *dec, const uint8x8_t h8, + uint8x8_t xs_idx, int8_t *out) { + xs_idx += h8; + int8x8_t as = vld1_s8(in); + int8x8_t bs = vld1_s8(&in[8]); + + int8x8_t llr1 = vtbl1_s8(as, vreinterpret_s8_u8(xs_idx)); + int8x8_t llr2 = vtbl1_s8(bs, vreinterpret_s8_u8(xs_idx)); + + uint8x8_t bit = vld1_u8(dec); + + int8x8_t result = + vbsl_s8(vceqz_u8(bit), vqadd_s8(llr2, llr1), vqsub_s8(llr2, llr1)); + vst1_s8(out, result); +} + +inline void g_l_x16(const int8_t *in, const uint8_t *dec, const uint8x16_t h8, + uint8x16_t xs_idx, int8_t *out) { + xs_idx += h8; + int8x16_t as = vld1q_s8(in); + int8x16_t bs = vld1q_s8(&in[16]); + + int8x16_t llr1 = vqtbl1q_s8(as, xs_idx); + int8x16_t llr2 = vqtbl1q_s8(bs, xs_idx); + + uint8x16_t bit = vld1q_u8(dec); + + int8x16_t result = + vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); + vst1q_s8(out, result); +} + +template<> +struct g_l_impl<2, 4> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + // specialised N=2-byte chunks interleaved (times L=4). + uint8x8_t h8 = vld_hist_l4(hist); + uint8x8_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4}; + g_l_x8(in, dec, h8, xs_idx, out); + } +}; + +template<> +struct g_l_impl<2, 8> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + // specialised N=2-byte chunks interleaved (times L=8). 
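+  // As in the generic g_l_impl loop above, the gather index for each lane is
+  // the byte offset of list entry 0 for that position (0 or 8 for Nhalf = 2,
+  // L = 8) plus the history byte for that lane, so a single vqtbl1q_s8 reads
+  // in[i * L + hist[j]] for all 16 output lanes at once.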
+ uint8x16_t h8 = vld_histq_l8(hist); + uint8x16_t xs_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + g_l_x16(in, dec, h8, xs_idx, out); + } +}; + +template<> +struct g_l_impl<4, 2> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + uint8x8_t h8 = vld_hist_l2(hist); + uint8x8_t xs_idx = {0, 0, 2, 2, 4, 4, 6, 6}; + g_l_x8(in, dec, h8, xs_idx, out); + } +}; + +template<> +struct g_l_impl<4, 4> { + static inline void g_l(const int8_t *in, const uint8_t *dec, + const uint8_t *hist, int8_t *out) { + // specialised N=4-byte chunks interleaved (times L=4). + uint8x16_t h8 = vld_histq_l4(hist); + uint8x16_t xs_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + g_l_x16(in, dec, h8, xs_idx, out); + } +}; + +template +inline void __attribute__((always_inline)) +combine_l(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, + const uint8_t *hist) { + static_assert(Nhalf >= 2); + if constexpr (L == 8) { + uint8x16_t h8 = vld_histq_l8(hist); + uint8x16_t x0_idx = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; + x0_idx += h8; + for (int i = 0; i < Nhalf; i += 2) { + uint8x16_t x0 = vld1q_u8(dec1); + uint8x16_t x1 = vld1q_u8(dec2); + x0 = vqtbl1q_u8(x0, x0_idx); + vst1q_u8(output, x0 ^ x1); + vst1q_u8(&output[Nhalf * L], x1); + dec1 += 16; + dec2 += 16; + output += 16; + } + } else if constexpr (L == 4 && Nhalf % 4 == 0) { + uint8x16_t h8 = vld_histq_l4(hist); + uint8x16_t x0_idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + x0_idx += h8; + for (int i = 0; i < Nhalf; i += 4) { + uint8x16_t x0 = vld1q_u8(dec1); + uint8x16_t x1 = vld1q_u8(dec2); + x0 = vqtbl1q_u8(x0, x0_idx); + vst1q_u8(output, x0 ^ x1); + vst1q_u8(&output[Nhalf * L], x1); + dec1 += 16; + dec2 += 16; + output += 16; + } + } else if constexpr (L == 2 && Nhalf % 8 == 0) { + uint8x16_t h8 = vld_histq_l2(hist); + uint8x16_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + x0_idx += h8; + for (int i = 0; i < Nhalf; i += 8) { + uint8x16_t x0 = vld1q_u8(dec1); + uint8x16_t x1 = vld1q_u8(dec2); + x0 = vqtbl1q_u8(x0, x0_idx); + vst1q_u8(output, x0 ^ x1); + vst1q_u8(&output[Nhalf * L], x1); + dec1 += 16; + dec2 += 16; + output += 16; + } + } else if constexpr (L == 1) { + for (int i = 0; i < Nhalf; ++i) { + output[i] = dec1[i] ^ dec2[i]; + output[Nhalf + i] = dec2[i]; + } + } else { + uint8_t x0[Nhalf * L]; + uint8_t x1[Nhalf * L]; + for (int j = 0; j < Nhalf; ++j) { + for (int i = 0; i < L; ++i) { + uint8_t h = L > 1 ? 
hist[i] : 0; + x0[L * j + i] = dec1[L * j + h]; + x1[L * j + i] = dec2[L * j + i]; + } + } + for (int i = 0; i < L * Nhalf; ++i) { + output[i] = x0[i] ^ x1[i]; + output[L * Nhalf + i] = x1[i]; + } + } +} + +template<> +inline void __attribute__((always_inline)) +combine_l<2, 2>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, + const uint8_t *hist) { + uint8x8_t h8 = vld_hist_l2(hist); + uint8x8_t x0 = vld1s_u8(dec1); + uint8x8_t x1 = vld1s_u8(dec2); + + uint8x8_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6}; + x0_idx += h8; + x0 = vtbl1_u8(x0, x0_idx); + + *(uint32_t *)output = vreinterpret_u32_u8(x0 ^ x1)[0]; + output += 4; + *(uint32_t *)output = vreinterpret_u32_u8(x1)[0]; +} + +template<> +inline void __attribute__((always_inline)) +combine_l<2, 4>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, + const uint8_t *hist) { + uint8x8_t h8 = vld_hist_l4(hist); + uint8x8_t x0 = vld1_u8(dec1); + uint8x8_t x1 = vld1_u8(dec2); + + uint8x8_t x0_idx = {0, 0, 0, 0, 4, 4, 4, 4}; + x0_idx += h8; + x0 = vtbl1_u8(x0, x0_idx); + + vst1_u8(output, x0 ^ x1); + vst1_u8(&output[8], x1); +} + +template<> +inline void __attribute__((always_inline)) +combine_l<4, 2>(const uint8_t *dec1, const uint8_t *dec2, uint8_t *output, + const uint8_t *hist) { + uint8x8_t h8 = vld_hist_l2(hist); + uint8x8_t x0 = vld1_u8(dec1); + uint8x8_t x1 = vld1_u8(dec2); + + uint8x8_t x0_idx = {0, 0, 2, 2, 4, 4, 6, 6}; + x0_idx += h8; + x0 = vtbl1_u8(x0, x0_idx); + + vst1_u8(output, x0 ^ x1); + vst1_u8(&output[8], x1); +} + +template +inline void combine_seq_out(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + for (int i = 0; i < L; ++i) { + uint8_t h = L > 1 ? hist2[i] : 0; + memcpy((void *)&p_u_seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2); + memcpy((void *)&p_u_seq_out[i * N + N / 2], (const void *)&seq2[i * N / 2], + N / 2); + } +} + +template<> +inline void combine_seq_out<2, 2>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint8x8_t h = vld1h_u8(hist2); + uint8x8_t s1 = vtbl1_u8(vld1h_u8(seq1), h); + uint8x8_t s2 = vld1h_u8(seq2); + *(uint32_t *)p_u_seq_out = vreinterpret_u32_u8(vzip1_u8(s1, s2))[0]; +} + +template<> +inline void combine_seq_out<2, 4>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint8x8_t h = vld1s_u8(hist2); + uint8x8_t s1 = vtbl1_u8(vld1s_u8(seq1), h); + uint8x8_t s2 = vld1s_u8(seq2); + vst1_u8(p_u_seq_out, vzip1_u8(s1, s2)); +} + +template<> +inline void combine_seq_out<2, 8>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint8x8_t h = vld1_u8(hist2); + uint8x8_t s1 = vtbl1_u8(vld1_u8(seq1), h); + uint8x8_t s2 = vld1_u8(seq2); + vst1q_u8(p_u_seq_out, vzip1l_u8(s1, s2)); +} + +template<> +inline void combine_seq_out<4, 2>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint16x4_t in1 = vld1s_u16((const uint16_t *)seq1); + uint16x4_t in2 = vld1s_u16((const uint16_t *)seq2); + + uint8x8_t h = vld1h_u8(hist2); + h = vzip1_u8(h, h); + uint8x8_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1}; + h_ofs0 = vsli_n_u8(h_ofs0, h, 1); + + in1 = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(in1), h_ofs0)); + + vst1_u16((uint16_t *)p_u_seq_out, vzip1_u16(in1, in2)); +} + +template<> +inline void combine_seq_out<4, 4>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint16x4_t in1 = vld1_u16((const uint16_t *)seq1); + uint16x4_t in2 = vld1_u16((const 
uint16_t *)seq2); + + uint8x8_t h = vld1s_u8(hist2); + h = vzip1_u8(h, h); + uint8x8_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1}; + h_ofs0 = vsli_n_u8(h_ofs0, h, 1); + + in1 = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(in1), h_ofs0)); + + vst1q_u16((uint16_t *)p_u_seq_out, vzip1l_u16(in1, in2)); +} + +template<> +inline void combine_seq_out<4, 8>(const uint8_t *seq1, const uint8_t *seq2, + const uint8_t *hist2, uint8_t *p_u_seq_out) { + uint16x8_t in1 = vld1q_u16((const uint16_t *)seq1); + uint16x8_t in2 = vld1q_u16((const uint16_t *)seq2); + + uint8x16_t h = vcombine_u8(vld1_u8(hist2), vdup_n_u8(0)); + h = vzip1q_u8(h, h); + uint8x16_t h_ofs0 = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + h_ofs0 = vsliq_n_u8(h_ofs0, h, 1); + + in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), h_ofs0)); + + vst1q_u16((uint16_t *)p_u_seq_out, vzip1q_u16(in1, in2)); + vst1q_u16((uint16_t *)p_u_seq_out + 8, vzip2q_u16(in1, in2)); +} + +template +inline void g(const int8_t *r1, const int8_t *r2, const uint8_t *dec, + int8_t *output) { + // Calculate beliefs for right children in the successive cancellation (SC) + // algorithm: + // g(a, b, c=0) = a + b + // g(a, b, c=1) = a - b + int16_t l = Length >> 4; + while (l > 0) { + int8x16_t llr1 = vld1q_s8(r1); + int8x16_t llr2 = vld1q_s8(r2); + uint8x16_t bit = vld1q_u8(dec); + int8x16_t result = + vbslq_s8(vceqzq_u8(bit), vqaddq_s8(llr2, llr1), vqsubq_s8(llr2, llr1)); + vst1q_s8(output, result); + l--; + r1 += 16; + r2 += 16; + dec += 16; + output += 16; + } + + if ((Length >> 3) & 1) { + int8x8_t llr1 = vld1_s8(r1); + int8x8_t llr2 = vld1_s8(r2); + uint8x8_t bit = vld1_u8(dec); + int8x8_t result = + vbsl_s8(vceqz_u8(bit), vqadd_s8(llr2, llr1), vqsub_s8(llr2, llr1)); + vst1_s8(output, result); + r1 += 8; + r2 += 8; + dec += 8; + output += 8; + } + + l = Length & 0x7; + while (l > 0) { + int8_t a = *r1++; + int8_t b = *r2++; + int8_t c = *dec++; + *output++ = sat_8((int16_t)(b + (1 - 2 * c) * a)); + l--; + } +} + +// calculate beliefs for left children in SCL algorithm +template +inline void f(const int8_t *r1, const int8_t *r2, int8_t *output) { + int16_t l = Length >> 4; + while (l > 0) { + int8x16_t llr1 = vld1q_s8(r1); + int8x16_t llr2 = vld1q_s8(r2); + uint8x16_t sign_vect = vcltzq_s8(veorq_s8(llr1, llr2)); + llr1 = vqabsq_s8(llr1); + llr2 = vqabsq_s8(llr2); + int8x16_t result = vminq_s8(llr1, llr2); + int8x16_t result_neg = vnegq_s8(result); + result = vbslq_s8(sign_vect, result_neg, result); + vst1q_s8(output, result); + l--; + r1 += 16; + r2 += 16; + output += 16; + } + + if ((Length >> 3) & 1) { + int8x8_t llr1 = vld1_s8(r1); + int8x8_t llr2 = vld1_s8(r2); + uint8x8_t sign_vect = vcltz_s8(veor_s8(llr1, llr2)); + llr1 = vqabs_s8(llr1); + llr2 = vqabs_s8(llr2); + int8x8_t result = vmin_s8(llr1, llr2); + int8x8_t result_neg = vneg_s8(result); + result = vbsl_s8(sign_vect, result_neg, result); + vst1_s8(output, result); + r1 += 8; + r2 += 8; + output += 8; + } + + l = Length & 0x7; + while (l > 0) { + int8_t a = *r1++; + int8_t b = *r2++; + *output++ = sat_8(sign(a * b) * min(a, b)); + l--; + } +} + +template +inline void combine_hist(const uint8_t *hist1, const uint8_t *hist2, + uint8_t *hist) { + for (int i = 0; i < L; ++i) { + hist[i] = hist1[hist2[i]]; + } +} + +template<> +inline void combine_hist<8>(const uint8_t *hist1, const uint8_t *hist2, + uint8_t *hist) { + uint8x8_t h1 = vld1_u8(hist1); + uint8x8_t h2 = vld1_u8(hist2); + uint8x8_t h = vtbl1_u8(h1, h2); + vst1_u8(hist, h); +} + +template<> +inline void combine_hist<4>(const uint8_t 
*hist1, const uint8_t *hist2, + uint8_t *hist) { + uint8x8_t h1 = vld1s_u8(hist1); + uint8x8_t h2 = vld1s_u8(hist2); + uint8x8_t h = vtbl1_u8(h1, h2); + *(uint32_t *)hist = vreinterpret_u32_u8(h)[0]; +} + +template<> +inline void combine_hist<2>(const uint8_t *hist1, const uint8_t *hist2, + uint8_t *hist) { + uint8x8_t h1 = vld1h_u8(hist1); + uint8x8_t h2 = vld1h_u8(hist2); + uint8x8_t h = vtbl1_u8(h1, h2); + *(uint16_t *)hist = vreinterpret_u16_u8(h)[0]; +} + +template<> +inline void combine_hist<1>(const uint8_t * /*hist1*/, + const uint8_t * /*hist2*/, uint8_t * /*hist*/) { + // nothing to do if L=1, only one choice of history. +} + +} // namespace \ No newline at end of file diff --git a/src/UpperPHY/Polar/arm_polar_encoder.c b/src/UpperPHY/Polar/arm_polar_encoder.c index 9441c31..cd74125 100644 --- a/src/UpperPHY/Polar/arm_polar_encoder.c +++ b/src/UpperPHY/Polar/arm_polar_encoder.c @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index 80fb6ae..6887a74 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp index 8e7f72e..2d6ca39 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp index 875a525..d1c6ff5 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp index c1c1e3b..8d0a317 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp index f9a8418..d450c19 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index 3e544c2..f0935c5 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ 
b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "intrinsics.h" @@ -127,13 +127,10 @@ void trellis_termination(const float32x4_t *sys, const float32x4_t *par, // previous decoding stage (extrinsic) void decode_step(const float32x4_t *sys, const float32x4_t *par, const float32x4_t *extrinsic, uint32_t k4, float32x4_t *llr, - float32x4_t *alpha, float32x4_t *beta, - const float32x4_t *beta_tail, float32x4x4_t *pdf4, - float32x4_t l_c) { - uint32_t a_k_idx; - uint32_t a_kp1_idx; - uint32_t b_k_idx; - uint32_t b_kp1_idx; + float32x4_t *alpha, const float32x4_t *beta_tail, + float32x4x4_t *pdf4, float32x4_t l_c) { + uint32_t k_idx; + uint32_t kp1_idx; constexpr uint8x16_t rev_idx = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; @@ -187,25 +184,25 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // Accumulate the state transition probabilities forwards through the // state transition trellis starting from the known encoder start state 0 for (uint32_t j = 0; j < 4; j++) { - a_k_idx = 8 * i + j * 2; - a_kp1_idx = a_k_idx + 2; + k_idx = 8 * i + j * 2; + kp1_idx = k_idx + 2; // We need g0 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][0], // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][0]}; - // a02 = {alpha[a_k_idx][0], alpha[a_k_idx][2], - // alpha[a_k_idx + 1][0], alpha[a_k_idx + 1][2]}; + // a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; float32x4_t g0 = pdf4[i].val[j]; - float32x4_t a02 = vuzp1q_f32(alpha[a_k_idx], alpha[a_k_idx + 1]); + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); float32x4_t left_1 = vaddq_f32(g0, a02); // We need g2 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][2], // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][2]}; - // a13 = {alpha[a_k_idx][1], alpha[a_k_idx][3], - // alpha[a_k_idx + 1][1], alpha[a_k_idx + 1][3]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 1][3]}; float32x4_t g2 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), rev_idx)); - float32x4_t a13 = vuzp2q_f32(alpha[a_k_idx], alpha[a_k_idx + 1]); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); float32x4_t right_1 = vaddq_f32(g2, a13); - alpha[a_kp1_idx] = vmaxq_f32(left_1, right_1); + alpha[kp1_idx] = vmaxq_f32(left_1, right_1); // We need g1 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][1], // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][1]}; @@ -215,12 +212,14 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][3]}; // which is g0 above float32x4_t right_2 = vaddq_f32(g0, a13); - alpha[a_kp1_idx + 1] = vmaxq_f32(left_2, right_2); + alpha[kp1_idx + 1] = vmaxq_f32(left_2, right_2); } } // Accumulate the state transition probabilities backwards through the state // transition trellis starting from the beginning of the precomputed tail + // and calculate the conditional probabilities of each bit being either 0 + // or 1 constexpr uint8x16_t idx_0312 = {0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11}; constexpr uint8x16_t idx_3021 = {12, 13, 14, 15, 0, 1, 2, 3, @@ -229,13 +228,19 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, 12, 13, 14, 15, 0, 1, 2, 3}; constexpr uint8x16_t idx_1203 = {4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15}; + constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, + 8, 
9, 10, 11, 0, 1, 2, 3}; + constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, + 4, 5, 6, 7, 12, 13, 14, 15}; + + float32x4x2_t beta_k; + float32x4x2_t beta_kp1 = {beta_tail[0], beta_tail[1]}; - beta[8 * k4] = beta_tail[0]; - beta[8 * k4 + 1] = beta_tail[1]; for (int32_t i = k4 - 1; i >= 0; i--) { + float32x4_t prob_0; + float32x4_t prob_1; for (int32_t j = 3; j >= 0; j--) { - b_k_idx = 8 * i + j * 2; - b_kp1_idx = b_k_idx + 2; + k_idx = 8 * i + j * 2; // We need g01_02 = {gamma[g_k_idx][0], gamma[g_k_idx][2], // gamma[g_k_idx + 1][0], gamma[g_k_idx + 1][2]}; @@ -243,7 +248,7 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // beta[b_kp1_idx][1], beta[b_kp1_idx][1]}; float32x4_t g01_02 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0312)); - float32x4_t b01 = vzip1q_f32(beta[b_kp1_idx], beta[b_kp1_idx]); + float32x4_t b01 = vzip1q_f32(beta_kp1.val[0], beta_kp1.val[0]); float32x4_t left_1 = vaddq_f32(g01_02, b01); // We need g13 = {gamma[g_k_idx][1], gamma[g_k_idx][3], @@ -252,9 +257,9 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // beta[b_kp1_idx + 1][1], beta[b_kp1_idx + 1][1]}; float32x4_t g13 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3021)); - float32x4_t bp1_01 = vzip1q_f32(beta[b_kp1_idx + 1], beta[b_kp1_idx + 1]); + float32x4_t bp1_01 = vzip1q_f32(beta_kp1.val[1], beta_kp1.val[1]); float32x4_t right_1 = vaddq_f32(g13, bp1_01); - beta[b_k_idx] = vmaxq_f32(left_1, right_1); + beta_k.val[0] = vmaxq_f32(left_1, right_1); // We need g23_02 = {gamma[g_k_idx + 2][0], gamma[g_k_idx + 2][2], // gamma[g_k_idx + 3][0], gamma[g_k_idx + 3][2]}; @@ -262,7 +267,7 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // beta[b_kp1_idx][3], beta[b_kp1_idx][3]}; float32x4_t g23_02 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_2130)); - float32x4_t b23 = vzip2q_f32(beta[b_kp1_idx], beta[b_kp1_idx]); + float32x4_t b23 = vzip2q_f32(beta_kp1.val[0], beta_kp1.val[0]); float32x4_t left_2 = vaddq_f32(g23_02, b23); // We need g23_13 = {gamma[g_k_idx + 2][1], gamma[g_k_idx + 2][3], @@ -271,65 +276,49 @@ void decode_step(const float32x4_t *sys, const float32x4_t *par, // beta[b_kp1_idx + 1][3], beta[b_kp1_idx + 1][3]}; float32x4_t g23_13 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_1203)); - float32x4_t bp1_23 = vzip2q_f32(beta[b_kp1_idx + 1], beta[b_kp1_idx + 1]); + float32x4_t bp1_23 = vzip2q_f32(beta_kp1.val[1], beta_kp1.val[1]); float32x4_t right_2 = vaddq_f32(g23_13, bp1_23); - beta[b_k_idx + 1] = vmaxq_f32(left_2, right_2); - } - } - - // Finally calculate the conditional probabilities of each bit being either 0 - // or 1 - constexpr uint8x16_t idx_0220 = {0, 1, 2, 3, 8, 9, 10, 11, - 8, 9, 10, 11, 0, 1, 2, 3}; - constexpr uint8x16_t idx_3113 = {12, 13, 14, 15, 4, 5, 6, 7, - 4, 5, 6, 7, 12, 13, 14, 15}; + beta_k.val[1] = vmaxq_f32(left_2, right_2); - for (uint32_t i = 0; i < k4; i++) { - float32x4_t prob_0; - float32x4_t prob_1; - for (uint32_t j = 0; j < 4; j++) { - a_k_idx = 8 * i + j * 2; - b_kp1_idx = a_k_idx + 2; - - // We need a02 = {alpha[a_k_idx][0], alpha[a_k_idx][2], - // alpha[a_k_idx + 1][0], alpha[a_k_idx + 1][2]}; - // a13 = {alpha[a_k_idx][1], alpha[a_k_idx][3], - // alpha[a_k_idx + 1][1], alpha[a_k_idx + 1][3]}; + // We need a02 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx + 1][0], alpha[k_idx + 1][2]}; + // a13 = {alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx + 1][1], alpha[k_idx + 
1][3]}; // b02_13 = {beta[b_kp1_idx][0], beta[b_kp1_idx + 1][1], // beta[b_kp1_idx][2], beta[b_kp1_idx + 1][3]}; // b13_02 = {beta[b_kp1_idx + 1][0], beta[b_kp1_idx][1], // beta[b_kp1_idx + 1][2], beta[b_kp1_idx][3]}; - float32x4_t a02 = vuzp1q_f32(alpha[a_k_idx], alpha[a_k_idx + 1]); - float32x4_t a13 = vuzp2q_f32(alpha[a_k_idx], alpha[a_k_idx + 1]); + float32x4_t a02 = vuzp1q_f32(alpha[k_idx], alpha[k_idx + 1]); + float32x4_t a13 = vuzp2q_f32(alpha[k_idx], alpha[k_idx + 1]); float32x4_t b02_13 = - vtrn2q_f32(vrev64q_f32(beta[b_kp1_idx]), beta[b_kp1_idx + 1]); + vtrn2q_f32(vrev64q_f32(beta_kp1.val[0]), beta_kp1.val[1]); float32x4_t b13_02 = - vtrn2q_f32(vrev64q_f32(beta[b_kp1_idx + 1]), beta[b_kp1_idx]); + vtrn2q_f32(vrev64q_f32(beta_kp1.val[1]), beta_kp1.val[0]); // Find the most probable path in which bit i was a 0 // We need g01_01 = {gamma[g_k_idx][0], gamma[g_k_idx + 1][1], // gamma[g_k_idx + 2][0], gamma[g_k_idx + 3][1]}; - // g32_32 = {gamma[g_k_idx][3], gamma[g_k_idx + 1][2], - // gamma[g_k_idx + 2][3], gamma[g_k_idx + 3][2]}; float32x4_t g01_01 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_0220)); - float32x4_t left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); - float32x4_t right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); + left_1 = vaddq_f32(vaddq_f32(a02, b02_13), g01_01); + right_1 = vaddq_f32(vaddq_f32(a13, b13_02), g01_01); prob_0[j] = vmaxvq_f32(vmaxq_f32(left_1, right_1)); // Find the most probable path in which bit i was a 1 // We need g10_10 = {gamma[g_k_idx][1], gamma[g_k_idx + 1][0], // gamma[g_k_idx + 2][1], gamma[g_k_idx + 3][0]}; - // g23_23 = {gamma[g_k_idx][2], gamma[g_k_idx + 1][3], - // gamma[g_k_idx + 2][2], gamma[g_k_idx + 3][3]}; float32x4_t g10_10 = vreinterpretq_f32_u8( vqtbl1q_u8(vreinterpretq_u8_f32(pdf4[i].val[j]), idx_3113)); - float32x4_t left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); - float32x4_t right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); - + left_2 = vaddq_f32(vaddq_f32(a02, b13_02), g10_10); + right_2 = vaddq_f32(vaddq_f32(a13, b02_13), g10_10); prob_1[j] = vmaxvq_f32(vmaxq_f32(left_2, right_2)); + + // Store the current value of beta to use in the next + // round of calculations + beta_kp1 = beta_k; } - // Calculate LLR + + // Calculate the LLRs llr[i] = vsubq_f32(prob_0, prob_1); } } @@ -374,17 +363,14 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Allocate space for log likelihood ratios from both stages of decoding auto l1_uky = allocate_uninitialized(allocator, k4); - auto l2_uky = allocate_uninitialized(allocator, k); - auto prev_l2_uky = allocate_zeroed(allocator, k); + auto l2_uky = allocate_uninitialized(allocator, k4); + auto prev_l2_uky = allocate_zeroed(allocator, k4); - // Allocate space to hold alpha, beta and gamma + // Allocate space to hold alpha and gamma // alpha stores the forward-accumulated state probabilities for each decoded // bit, where the LTE encoder has 8 states and there are k+3 bits to decode // plus the starting condition auto alpha = allocate_uninitialized(allocator, 8 * k4 + 2); - // beta stores the backwards-accumulated state probabilities for each decoded - // bit - auto beta = allocate_uninitialized(allocator, 8 * k4 + 2); // gamma stores the conditional state transition probabilities for each of the // k+3 bits to decode auto gamma = allocate_uninitialized(allocator, k4); @@ -426,11 +412,11 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Generate the permutation vector for the input value of k // Find 
the index into the array of parameter arrays corresponding // to the current k. Subtract 40 because k=40 is the lowest value. - int param_idx = armral_turbo_tables::perm_params_lookup[(k - 40) >> 3]; + int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; // and extract the correct values of f1 and f2 to build the // interleaving polynomial - uint16_t f1 = armral_turbo_tables::perm_params[param_idx][0]; - uint16_t f2 = armral_turbo_tables::perm_params[param_idx][1]; + uint16_t f1 = armral::turbo::perm_params[param_idx][0]; + uint16_t f2 = armral::turbo::perm_params[param_idx][1]; for (uint32_t i = 0; i < k; i++) { perm_idx[i] = generate_perm_idx(i, f1, f2, k); } @@ -479,8 +465,7 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, while (num_iter < max_iter) { // Run the first decoder step decode_step(sys_f32.get(), par_f32.get(), extrinsic.get(), k4, l1_uky.get(), - alpha.get(), beta.get(), beta_tail, gamma.get(), - channel_reliability); + alpha.get(), beta_tail, gamma.get(), channel_reliability); // Compute the new extrinsic information to pass into the second decoder update_extrinsic(k4, l1_uky.get(), extrinsic.get(), sys_f32.get()); @@ -499,8 +484,8 @@ void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, // Run the second decoder step decode_step(perm_sys.get(), itl_f32.get(), perm_extrinsic.get(), k4, - l2_uky.get(), alpha.get(), beta.get(), perm_beta_tail, - gamma.get(), channel_reliability); + l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), + channel_reliability); // Compute the new extrinsic information to pass back into the first encoder update_extrinsic(k4, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); diff --git a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp index 98e0e05..62acc61 100644 --- a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_code.hpp" @@ -20,9 +20,9 @@ inline void rsc_encode(const uint8_t *c, uint32_t k_bytes, uint8_t &state, for (uint32_t k = 0; k < k_bytes; k++) { uint8_t input_block = c[k]; // use input byte - z[k] = armral_turbo_tables::encoded_bytes[8 * input_block + state]; + z[k] = armral::turbo::encoded_bytes[8 * input_block + state]; // update encoder state - state = armral_turbo_tables::new_state_bytes[8 * input_block + state]; + state = armral::turbo::new_state_bytes[8 * input_block + state]; } } @@ -33,9 +33,9 @@ inline void trellis_encode(uint8_t &state, uint8_t &x, uint8_t &z) { z = 0; // generate 3 bits of output in x and z for (int i = 2; i >= 0; i--) { - uint8_t symbol = armral_turbo_tables::trellis_output_symbol[8 * i + state]; + uint8_t symbol = armral::turbo::trellis_output_symbol[8 * i + state]; x |= symbol; - symbol = armral_turbo_tables::trellis_encoded_symbol[8 * i + state]; + symbol = armral::turbo::trellis_encoded_symbol[8 * i + state]; z |= symbol; // the state transitions here are: // old state 0 1 2 3 4 5 6 7 @@ -86,11 +86,11 @@ inline void terminate_trellis(uint8_t &state0, uint8_t &state1, uint8_t *d0, inline void interleave(const uint8_t *c, uint8_t *c_prime, uint32_t k) { // find the index into the array of parameter arrays corresponding // to the current k. Subtract 40 because k=40 is the lowest value. 
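 // The f1 and f2 values extracted below are the coefficients of the 3GPP TS
 // 36.212 quadratic permutation polynomial (QPP) interleaver,
 // perm_idx[i] = (f1*i + f2*i*i) mod k, which generate_perm_idx evaluates for
 // each bit index i.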
- int param_idx = armral_turbo_tables::perm_params_lookup[(k - 40) >> 3]; + int param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; // and extract the correct values of f1 and f2 to build the // interleaving polynomial - uint16_t f1 = armral_turbo_tables::perm_params[param_idx][0]; - uint16_t f2 = armral_turbo_tables::perm_params[param_idx][1]; + uint16_t f1 = armral::turbo::perm_params[param_idx][0]; + uint16_t f2 = armral::turbo::perm_params[param_idx][1]; for (uint32_t i = 0; i < k; i++) { // 0 <= perm_idx < 6144 but f2*i*i may be much larger int perm_idx = armral::turbo::generate_perm_idx(i, f1, f2, k); diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp index 9e8c54c..7048c97 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "turbo_tables.hpp" @@ -76,22 +76,21 @@ static void subblock_interleave(uint32_t d, uint32_t kw, const uint8_t *d0, dummy2[ndb] = 0xF0; // Number of rows of the information bit matrix - assert(kpi % armral_turbo_tables::ctc == 0); - const uint32_t rtc = kpi / armral_turbo_tables::ctc; + assert(kpi % armral::turbo::ctc == 0); + const uint32_t rtc = kpi / armral::turbo::ctc; // Perform inter-column permutation for each row of d^(0)_k and d^(1)_k for (uint32_t i = 0; i < rtc; ++i) { - for (uint32_t j = 0; j < armral_turbo_tables::ctc; ++j) { - uint32_t idx = - (armral_turbo_tables::p[j] + i * armral_turbo_tables::ctc) / 8; - uint32_t jdx = 7 & ~armral_turbo_tables::p[j]; + for (uint32_t j = 0; j < armral::turbo::ctc; ++j) { + uint32_t idx = (armral::turbo::p[j] + i * armral::turbo::ctc) / 8; + uint32_t jdx = 7 & ~armral::turbo::p[j]; uint32_t y0bit = (y0[idx] >> jdx) & 1; uint32_t y1bit = (y1[idx] >> jdx) & 1; uint32_t yperm_jdx = 7 & ~j; - work_buffers.y0_perm[(j + i * armral_turbo_tables::ctc) / 8] |= - y0bit << yperm_jdx; - work_buffers.y1_perm[(j + i * armral_turbo_tables::ctc) / 8] |= - y1bit << yperm_jdx; + work_buffers.y0_perm[(j + i * armral::turbo::ctc) / 8] |= y0bit + << yperm_jdx; + work_buffers.y1_perm[(j + i * armral::turbo::ctc) / 8] |= y1bit + << yperm_jdx; } } @@ -120,14 +119,13 @@ static void subblock_interleave(uint32_t d, uint32_t kw, const uint8_t *d0, // Read out permuted matrix column by column for y^(0) and y^(1) // Perform permutation for y^(2) - for (uint32_t j = 0; j < armral_turbo_tables::ctc; ++j) { + for (uint32_t j = 0; j < armral::turbo::ctc; ++j) { for (uint32_t i = 0; i < rtc; ++i) { - uint32_t pi = - (armral_turbo_tables::p[j] + i * armral_turbo_tables::ctc + 1) % kpi; + uint32_t pi = (armral::turbo::p[j] + i * armral::turbo::ctc + 1) % kpi; uint32_t vidx = (i + j * rtc) / 8; uint32_t vjdx = 7 & ~(i + j * rtc); - uint32_t y0idx = (j + i * armral_turbo_tables::ctc) / 8; - uint32_t y0jdx = 7 & ~(j + i * armral_turbo_tables::ctc); + uint32_t y0idx = (j + i * armral::turbo::ctc) / 8; + uint32_t y0jdx = 7 & ~(j + i * armral::turbo::ctc); uint32_t y2idx = pi / 8; uint32_t y2jdx = 7 & ~pi; v0[vidx] |= ((work_buffers.y0_perm[y0idx] >> y0jdx) & 1) << vjdx; @@ -212,9 +210,8 @@ armral_status rate_matching(uint32_t d, uint32_t e, uint32_t rv, assert(rv >= 0 && rv <= 3); // The minimum number of rows which gives rtc * ctc >= d. 
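 // For example, with ctc = 32 columns and d = 100 this gives
 // rtc = (100 + 31) / 32 = 4 rows, kpi = 4 * 32 = 128 and kw = 3 * kpi = 384.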
- const uint32_t rtc = - (d + armral_turbo_tables::ctc - 1) / armral_turbo_tables::ctc; - const uint32_t kpi = rtc * armral_turbo_tables::ctc; + const uint32_t rtc = (d + armral::turbo::ctc - 1) / armral::turbo::ctc; + const uint32_t kpi = rtc * armral::turbo::ctc; const uint32_t kw = 3 * kpi; const uint32_t kpib = kpi / 8; const uint32_t kwb = kw / 8; diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp index 85212d6..416cd38 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -26,22 +26,21 @@ static void generate_dummy_bits_tracking(uint32_t d, uint32_t rtc, // in the turbo code. Where these are in the overall input data stream is // determined in a manner similar to the encoding, described in section // 5.1.4.1.1 in 3GPP specification 36.212. - const uint32_t kpi = rtc * armral_turbo_tables::ctc; + const uint32_t kpi = rtc * armral::turbo::ctc; const uint32_t nd = kpi - d; // Tag nd elements as dummy bits. // dummy0 and dummy1 are permuted and transposed. for (uint32_t i = 0; i < nd; ++i) { - work_buffers.dummy0[armral_turbo_tables::p[i] * rtc] = 1; - work_buffers.dummy1[armral_turbo_tables::p[i] * rtc] = 1; + work_buffers.dummy0[armral::turbo::p[i] * rtc] = 1; + work_buffers.dummy1[armral::turbo::p[i] * rtc] = 1; } // Permutation for dummy2 for (uint32_t i = 0; i < kpi; ++i) { // TODO: We don't need to go through all of kpi here. We should be able to // identify where each of the nd < crc = 32 bits goes. - uint32_t pi = (armral_turbo_tables::p[i / rtc] + - armral_turbo_tables::ctc * (i % rtc) + 1) % - kpi; + uint32_t pi = + (armral::turbo::p[i / rtc] + armral::turbo::ctc * (i % rtc) + 1) % kpi; if (pi < nd) { work_buffers.dummy2[i] = 1; } @@ -70,23 +69,22 @@ subblock_deinterleave(uint32_t d, uint32_t rtc, const int8_t *v0, int8_t *d1, int8_t *d2, subblock_deinterleave_work_buffers work_buffers) { - const uint32_t kpi = rtc * armral_turbo_tables::ctc; + const uint32_t kpi = rtc * armral::turbo::ctc; const uint32_t nd = kpi - d; // Reverse permutation and transpose for d^(0)_k and d^(1)_k for (uint32_t i = 0; i < rtc; ++i) { - for (uint32_t j = 0; j < armral_turbo_tables::ctc; ++j) { - uint32_t k = j + i * armral_turbo_tables::ctc; - work_buffers.y0[k] = v0[i + armral_turbo_tables::p[j] * rtc]; - work_buffers.y1[k] = v1[i + armral_turbo_tables::p[j] * rtc]; + for (uint32_t j = 0; j < armral::turbo::ctc; ++j) { + uint32_t k = j + i * armral::turbo::ctc; + work_buffers.y0[k] = v0[i + armral::turbo::p[j] * rtc]; + work_buffers.y1[k] = v1[i + armral::turbo::p[j] * rtc]; } } // Reverse permutation for d^(2)_k for (uint32_t i = 0; i < kpi; ++i) { - uint32_t pi = (armral_turbo_tables::p[i / rtc] + - armral_turbo_tables::ctc * (i % rtc) + 1) % - kpi; + uint32_t pi = + (armral::turbo::p[i / rtc] + armral::turbo::ctc * (i % rtc) + 1) % kpi; work_buffers.y2[pi] = v2[i]; } @@ -133,17 +131,16 @@ static void bit_deselection(uint32_t ncb, uint32_t k0, uint32_t e, } template -armral_status turbo_rate_recovery(uint32_t d, uint32_t e, uint32_t rv, - const int8_t *src, int8_t *dst0, int8_t *dst1, - int8_t *dst2, Allocator &allocator) { +armral_status rate_recovery(uint32_t d, uint32_t e, uint32_t rv, + const int8_t *src, int8_t *dst0, int8_t *dst1, + int8_t *dst2, Allocator &allocator) { assert(d > 0); 
assert(e > 0); assert(rv >= 0 && rv <= 3); // The minimum number of rows which gives rtc * ctc >= d. - const uint32_t rtc = - (d + armral_turbo_tables::ctc - 1) / armral_turbo_tables::ctc; - const uint32_t kpi = rtc * armral_turbo_tables::ctc; + const uint32_t rtc = (d + armral::turbo::ctc - 1) / armral::turbo::ctc; + const uint32_t kpi = rtc * armral::turbo::ctc; const uint32_t kw = 3 * kpi; auto dummy = allocate_zeroed(allocator, kpi * 3); @@ -197,8 +194,8 @@ armral_status armral_turbo_rate_recovery(uint32_t d, uint32_t e, uint32_t rv, const int8_t *src, int8_t *dst0, int8_t *dst1, int8_t *dst2) { heap_allocator allocator{}; - return armral::turbo::turbo_rate_recovery(d, e, rv, src, dst0, dst1, dst2, - allocator); + return armral::turbo::rate_recovery(d, e, rv, src, dst0, dst1, dst2, + allocator); } armral_status armral_turbo_rate_recovery_noalloc(uint32_t d, uint32_t e, @@ -206,14 +203,14 @@ armral_status armral_turbo_rate_recovery_noalloc(uint32_t d, uint32_t e, int8_t *dst0, int8_t *dst1, int8_t *dst2, void *buffer) { buffer_bump_allocator allocator{buffer}; - return armral::turbo::turbo_rate_recovery(d, e, rv, src, dst0, dst1, dst2, - allocator); + return armral::turbo::rate_recovery(d, e, rv, src, dst0, dst1, dst2, + allocator); } uint32_t armral_turbo_rate_recovery_noalloc_buffer_size(uint32_t d, uint32_t e, uint32_t rv) { counting_allocator allocator{}; - (void)armral::turbo::turbo_rate_recovery(d, e, rv, nullptr, nullptr, nullptr, - nullptr, allocator); + (void)armral::turbo::rate_recovery(d, e, rv, nullptr, nullptr, nullptr, + nullptr, allocator); return allocator.required_bytes(); } diff --git a/src/UpperPHY/Turbo/turbo_code.hpp b/src/UpperPHY/Turbo/turbo_code.hpp index 178780a..f389fed 100644 --- a/src/UpperPHY/Turbo/turbo_code.hpp +++ b/src/UpperPHY/Turbo/turbo_code.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/UpperPHY/Turbo/turbo_tables.hpp b/src/UpperPHY/Turbo/turbo_tables.hpp index 1f9dd5c..1a59ae8 100644 --- a/src/UpperPHY/Turbo/turbo_tables.hpp +++ b/src/UpperPHY/Turbo/turbo_tables.hpp @@ -1,11 +1,11 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once -namespace armral_turbo_tables { +namespace armral::turbo { // Precomputed encoded outputs from the RSC encoder indexed by the // current state of the internal registers (curr_state) and which bit @@ -381,4 +381,4 @@ static constexpr uint8_t p[ctc] = {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31}; -} // namespace armral_turbo_tables +} // namespace armral::turbo diff --git a/src/intrinsics.h b/src/intrinsics.h index b2dc080..7fd26f0 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -380,6 +380,33 @@ vst1q_f32_x2(float32_t *dest, float32x4x2_t value) { #endif #ifdef __ARM_FEATURE_SVE +static inline svuint16_t __attribute__((always_inline, artificial)) +svld1rh_u16(svbool_t pg, const uint8_t *ptr) { + svuint16_t ret; + asm("ld1rh {%0.h}, %1/Z, %2" + : "=w"(ret) + : "Upl"(pg), "Q"(*(const uint16_t *)ptr)); + return ret; +} + +static inline svuint32_t __attribute__((always_inline, artificial)) +svld1rw_u32(svbool_t pg, 
const uint8_t *ptr) { + svuint32_t ret; + asm("ld1rw {%0.s}, %1/Z, %2" + : "=w"(ret) + : "Upl"(pg), "Q"(*(const uint32_t *)ptr)); + return ret; +} + +static inline svuint64_t __attribute__((always_inline, artificial)) +svld1rd_u64(svbool_t pg, const uint8_t *ptr) { + svuint64_t ret; + asm("ld1rd {%0.d}, %1/Z, %2" + : "=w"(ret) + : "Upl"(pg), "Q"(*(const uint64_t *)ptr)); + return ret; +} + // Reverses pairs of floats within a SVE vector // [a.re a.im b.re b.im] --> [a.im a.re b.im b.re] static inline svfloat32_t __attribute__((always_inline, artificial)) diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp index 3a13bc2..e664173 100644 --- a/src/utils/allocators.hpp +++ b/src/utils/allocators.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/utils/cmplx_arith_f32.hpp b/src/utils/cmplx_arith_f32.hpp index 5e3eb4b..32644da 100644 --- a/src/utils/cmplx_arith_f32.hpp +++ b/src/utils/cmplx_arith_f32.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/src/utils/vec_mul.hpp b/src/utils/vec_mul.hpp index f433685..2c4896a 100644 --- a/src/utils/vec_mul.hpp +++ b/src/utils/vec_mul.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/CRC/main.cpp b/test/CRC/main.cpp index ee9882d..ed3941a 100644 --- a/test/CRC/main.cpp +++ b/test/CRC/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/ConvCoding/decoding/main.cpp b/test/ConvCoding/decoding/main.cpp index b20a62a..d768fbe 100644 --- a/test/ConvCoding/decoding/main.cpp +++ b/test/ConvCoding/decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" @@ -73,8 +73,10 @@ static bool run_convolutional_decoding_test( bool passed = true; if (ret != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START printf("Error! [%s] k=%u did not return ARMRAL_SUCCESS\n", name, k); passed = false; + // GCOVR_EXCL_STOP } else { printf("[%s] k=%u\n", name, k); auto check_dst = diff --git a/test/ConvCoding/encoding/main.cpp b/test/ConvCoding/encoding/main.cpp index 1087714..fab64d1 100644 --- a/test/ConvCoding/encoding/main.cpp +++ b/test/ConvCoding/encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" @@ -70,9 +70,11 @@ static bool run_convolutional_encoding_test(int k) { bool passed = true; if (ret != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START const char *name = "Convolutional_Encoding"; printf("Error! 
[%s_%d] did not return ARMRAL_SUCCESS\n", name, k); passed = false; + // GCOVR_EXCL_STOP } else { auto check_dst0 = check_results_u8("CONVOLUTIONAL ENCODING (STREAM D0)", dst0.data(), diff --git a/test/Correlation/main.cpp b/test/Correlation/main.cpp index 707da33..192ba5f 100644 --- a/test/Correlation/main.cpp +++ b/test/Correlation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/Demodulation/main.cpp b/test/Demodulation/main.cpp index 73a0545..4833b65 100644 --- a/test/Demodulation/main.cpp +++ b/test/Demodulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/ElemWiseVectorMult/vecMul16/main.cpp b/test/ElemWiseVectorMult/vecMul16/main.cpp index ca798fc..ea5da35 100644 --- a/test/ElemWiseVectorMult/vecMul16/main.cpp +++ b/test/ElemWiseVectorMult/vecMul16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" @@ -87,7 +87,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_mul_test(n); } const int saturation_len[] = {1, 3, 8, 9}; diff --git a/test/ElemWiseVectorMult/vecMul16_2/main.cpp b/test/ElemWiseVectorMult/vecMul16_2/main.cpp index 623335e..ed6cb76 100644 --- a/test/ElemWiseVectorMult/vecMul16_2/main.cpp +++ b/test/ElemWiseVectorMult/vecMul16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" @@ -104,7 +104,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_mul_test(n); } const int saturation_len[] = { diff --git a/test/ElemWiseVectorMult/vecMul32/main.cpp b/test/ElemWiseVectorMult/vecMul32/main.cpp index fc98847..9ac33ac 100644 --- a/test/ElemWiseVectorMult/vecMul32/main.cpp +++ b/test/ElemWiseVectorMult/vecMul32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -30,7 +30,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_mul_test(n); } exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/ElemWiseVectorMult/vecMul32_2/main.cpp b/test/ElemWiseVectorMult/vecMul32_2/main.cpp index f5fda1c..323367c 100644 --- a/test/ElemWiseVectorMult/vecMul32_2/main.cpp +++ b/test/ElemWiseVectorMult/vecMul32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -41,7 +41,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_mul_test(n); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/FFT/cf32/main.cpp b/test/FFT/cf32/main.cpp index ac0c381..ed8483c 100644 --- a/test/FFT/cf32/main.cpp +++ b/test/FFT/cf32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" diff --git a/test/FFT/cs16/main.cpp b/test/FFT/cs16/main.cpp index 3fc058e..6f88d57 100644 --- a/test/FFT/cs16/main.cpp +++ b/test/FFT/cs16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" diff --git a/test/FIR/arm_fir_filter_cf32/main.cpp b/test/FIR/arm_fir_filter_cf32/main.cpp index f598a4b..c8c3643 100644 --- a/test/FIR/arm_fir_filter_cf32/main.cpp +++ b/test/FIR/arm_fir_filter_cf32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -48,7 +48,7 @@ int main(int argc, char **argv) { {8192, 32}, {10240, 4}, }; bool passed = true; - for (auto &p : params) { + for (const auto &p : params) { passed &= run_fir_test(p.first, p.second); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp b/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp index b65a01c..9c8c8e8 100644 --- a/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp +++ b/test/FIR/arm_fir_filter_cf32_decimate_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -54,7 +54,7 @@ int main(int argc, char **argv) { {8192, 32}, {10240, 32}, }; bool passed = true; - for (auto &p : params) { + for (const auto &p : params) { passed &= run_fir_test(p.first, p.second); } exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/FIR/arm_fir_filter_cs16/main.cpp b/test/FIR/arm_fir_filter_cs16/main.cpp index e8c21c6..7103678 100644 --- a/test/FIR/arm_fir_filter_cs16/main.cpp +++ b/test/FIR/arm_fir_filter_cs16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" diff --git a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp b/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp index eca801e..a247b98 100644 --- a/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp +++ b/test/FIR/arm_fir_filter_cs16_decimate_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" @@ -54,7 +54,7 @@ int main(int argc, char **argv) { {8192, 32}, {10240, 32}, }; bool passed = true; - for (auto &p : params) { + for (const auto &p : params) { passed &= run_fir_test(p.first, p.second); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/LDPC/decoding/main.cpp b/test/LDPC/decoding/main.cpp index 809d114..9362a05 100644 --- a/test/LDPC/decoding/main.cpp +++ b/test/LDPC/decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "../ldpc_test_common.hpp" diff --git a/test/LDPC/encoding/ldpc_encoding_test_data.h b/test/LDPC/encoding/ldpc_encoding_test_data.h index 792bcb8..09947ba 100644 --- a/test/LDPC/encoding/ldpc_encoding_test_data.h +++ b/test/LDPC/encoding/ldpc_encoding_test_data.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/LDPC/encoding/main.cpp b/test/LDPC/encoding/main.cpp index 2c2ba9b..8b78ea4 100644 --- a/test/LDPC/encoding/main.cpp +++ b/test/LDPC/encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "../ldpc_test_common.hpp" #include "armral.h" @@ -8,6 +8,7 @@ #include "int8_utils.hpp" #include "ldpc_coding.hpp" #include "ldpc_encoding_test_data.h" + #include #include @@ -155,7 +156,7 @@ inline void set_remaining_bits(armral_ldpc_graph_t bg, uint32_t z, uint32_t lsi, // the number of index sets (8), and then the lifting set index // is added to this const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * col_entries; uint32_t j = 0; for (; j < col_entries && col_ptr[j] < max_ind; ++j) { @@ -210,7 +211,7 @@ std::vector armral_ldpc_encode_block_ref(const uint8_t *data_in, // is first offset by the row start index multiplied by // the number of index sets (8), and then const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * col_entries; uint32_t j = 0; for (; j < col_entries && col_ptr[j] < max_message_ind; ++j) { @@ -354,7 +355,9 @@ bool test_ldpc_encode_block( passed &= check_bytes_equal(encoding_bytes, punctured, tc); } if (!passed) { + // GCOVR_EXCL_START printf("[%s] one or more tests failed!\n", name); + // GCOVR_EXCL_STOP } return passed; } diff --git a/test/LDPC/ldpc_test_common.hpp 
b/test/LDPC/ldpc_test_common.hpp index 7ad4960..0623f9f 100644 --- a/test/LDPC/ldpc_test_common.hpp +++ b/test/LDPC/ldpc_test_common.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -42,7 +42,7 @@ bool perform_parity_check(const uint8_t *c, uint32_t z, auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; const auto *col_ptr = graph->col_inds + row_start_ind; const auto *shift_ptr = graph->shifts + - row_start_ind * armral_ldpc::num_lifting_sets + + row_start_ind * armral::ldpc::num_lifting_sets + lsi * num_cols; // Loop through the rows in the block for (uint32_t zb = 0; zb < z; ++zb) { diff --git a/test/LDPC/rate_matching/main.cpp b/test/LDPC/rate_matching/main.cpp index e677b6f..783c882 100644 --- a/test/LDPC/rate_matching/main.cpp +++ b/test/LDPC/rate_matching/main.cpp @@ -1,10 +1,11 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" #include "int8_utils.hpp" + #include #include #include @@ -34,7 +35,7 @@ void ref_bit_selection(uint32_t z, uint32_t n, uint32_t e, auto *scratch_ptr2 = scratch_buf2.data(); // bit selection as specified by section 5.4.2.1 in 3GPP TS 38.212 - // remove Filler bits + // remove filler bits if (len_filler_bits > 0) { uint32_t len_s_f_bits = k - z * 2; // length of systematic & filler bits @@ -47,9 +48,9 @@ void ref_bit_selection(uint32_t z, uint32_t n, uint32_t e, uint32_t len_s_bytes = len_s_bits >> 3; uint32_t len_p_bytes = len_p_bits >> 3; - memcpy(scratch_ptr1, in, len_s_bytes); // skip Filler bits + memcpy(scratch_ptr1, in, len_s_bytes); // skip filler bits memcpy(&scratch_ptr1[len_s_bytes], &in[len_s_f_bytes], - len_p_bytes); // skip Filler bits + len_p_bytes); // skip filler bits } else { @@ -67,7 +68,7 @@ void ref_bit_selection(uint32_t z, uint32_t n, uint32_t e, // k0 depends on the redundancy version id. assert(n > 0); assert(e > 0); - assert(k0 >= 0 && k0 < n); + assert(k0 < n); assert(n % 2 == 0); for (uint32_t i = 0; i < (e + 7) / 8; i++) { @@ -241,8 +242,8 @@ bool test_ref_rate_matching() { return passed; } -int starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, - uint32_t ncb, uint32_t z) { +uint32_t starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, + uint32_t ncb, uint32_t z) { // Starting position k0 of different redundancy versions // given as Table 5.4.2.1-2 in 3GPP TS 38.212, simplified // using the assumption N_cb = 66 * Z_c (base graph 1) or @@ -377,7 +378,7 @@ bool test_ldpc_rate_matching( if (bg == LDPC_BASE_GRAPH_2) { g = 10 * z; } - // cosider single layer, single CB . + // Consider single layer, single CB. uint32_t num_res = qm * (mod == ARMRAL_MOD_QPSK ? 
144 : 32); // 12 symbols or 3 symbols diff --git a/test/LDPC/rate_recovery/main.cpp b/test/LDPC/rate_recovery/main.cpp index 1993a3d..499d98b 100644 --- a/test/LDPC/rate_recovery/main.cpp +++ b/test/LDPC/rate_recovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" @@ -18,7 +18,7 @@ void ref_undo_selection(uint32_t z, uint32_t n, uint32_t e, const int8_t *in, int8_t *out) { // performs the inverse of the bit selection as specified by // section 5.4.2.1 in 3GPP TS 38.212 - assert(k0 >= 0 && k0 < n); + assert(k0 < n); assert(e > 0); // As we aggregate LLRs, for a single message, out should be zero on entry. @@ -122,8 +122,8 @@ void ref_undo_interleave(uint32_t e, uint32_t qm, const int8_t *in, } } -int starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, - uint32_t ncb, uint32_t z) { +uint32_t starting_position(armral_ldpc_graph_t bg, uint32_t rv, uint32_t n, + uint32_t ncb, uint32_t z) { // Duplicate of function with same name in rate_matching // Starting position k0 of different redundancy versions @@ -363,7 +363,9 @@ bool test_ldpc_rate_recovery( } if (!passed) { + // GCOVR_EXCL_START printf("[%s] one or more tests failed!\n", name); + // GCOVR_EXCL_STOP } return passed; } diff --git a/test/MatrixInv/batch/main.cpp b/test/MatrixInv/batch/main.cpp index 22e0399..74b1fcd 100644 --- a/test/MatrixInv/batch/main.cpp +++ b/test/MatrixInv/batch/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" @@ -26,17 +26,17 @@ static void reference_parallel_matinv_block(uint32_t m, printf(" > Check ref matrix %u/%u\n", batch + 1, batch_size); // allocate temporary matrices - armral_cmplx_f32_t mat[m * m]; - armral_cmplx_f32_t res[m * m]; + std::vector mat(m * m); + std::vector res(m * m); // unpack matrix - unpack_data(batch, batch_size, a, mat, m * m); + unpack_data(batch, batch_size, a, mat.data(), m * m); // run inversion on each matrix (sequentially) - reference_matinv_block(m, mat, res); + reference_matinv_block(m, mat.data(), res.data()); // pack result - pack_data(batch, batch_size, res, b, m * m); + pack_data(batch, batch_size, res.data(), b, m * m); } } diff --git a/test/MatrixInv/single/main.cpp b/test/MatrixInv/single/main.cpp index 3280e11..9d7e3f9 100644 --- a/test/MatrixInv/single/main.cpp +++ b/test/MatrixInv/single/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cf32_utils.hpp" diff --git a/test/MatrixMult/batch/ArmSolve/main.cpp b/test/MatrixMult/batch/ArmSolve/main.cpp index 5b09a82..4ce7201 100644 --- a/test/MatrixMult/batch/ArmSolve/main.cpp +++ b/test/MatrixMult/batch/ArmSolve/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "cs16_utils.hpp" diff --git a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp b/test/MatrixMult/batch/MatrixVectorMult16/main.cpp index c8b14c0..c8e8510 100644 --- a/test/MatrixMult/batch/MatrixVectorMult16/main.cpp +++ b/test/MatrixMult/batch/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration 
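The batch matrix-inversion hunk above replaces stack VLAs with `std::vector` but keeps the same per-batch unpack → invert → pack loop. A generic sketch of that pattern is shown below; the interleaved batched layout used here is an assumption for illustration, since the real layout is whatever `unpack_data`/`pack_data` in the test utilities implement.

```cpp
#include <cstdint>
#include <vector>

// Per-batch reference pattern: unpack one matrix from batched storage, run a
// single-matrix reference on it, pack the result back. Layout assumption:
// element i of every matrix in the batch is stored contiguously.
template <typename T, typename SingleMatrixFn>
void batched_reference(uint32_t m, uint32_t batch_size, const T *a, T *b,
                       SingleMatrixFn single_matrix_ref) {
  std::vector<T> mat(m * m);
  std::vector<T> res(m * m);
  for (uint32_t batch = 0; batch < batch_size; ++batch) {
    // Unpack matrix `batch` (assumed interleaved layout).
    for (uint32_t i = 0; i < m * m; ++i) {
      mat[i] = a[i * batch_size + batch];
    }
    single_matrix_ref(m, mat.data(), res.data());
    // Pack the result back into the same layout.
    for (uint32_t i = 0; i < m * m; ++i) {
      b[i * batch_size + batch] = res[i];
    }
  }
}

int main() {
  // Toy usage: a 2x2 "reference" that just copies the matrix, batch of 3.
  const uint32_t m = 2;
  const uint32_t batch = 3;
  std::vector<float> a(m * m * batch, 1.0F);
  std::vector<float> b(m * m * batch, 0.0F);
  batched_reference<float>(m, batch, a.data(), b.data(),
                           [](uint32_t mm, const float *in, float *out) {
                             for (uint32_t i = 0; i < mm * mm; ++i) {
                               out[i] = in[i];
                             }
                           });
  return b == a ? 0 : 1;
}
```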
Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "matrix_utils.hpp" diff --git a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp b/test/MatrixMult/batch/MatrixVectorMult32/main.cpp index 4872050..41f3c73 100644 --- a/test/MatrixMult/batch/MatrixVectorMult32/main.cpp +++ b/test/MatrixMult/batch/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" diff --git a/test/MatrixMult/single/MatrixMult16/main.cpp b/test/MatrixMult/single/MatrixMult16/main.cpp index 98fed6d..e882b7d 100644 --- a/test/MatrixMult/single/MatrixMult16/main.cpp +++ b/test/MatrixMult/single/MatrixMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixMult32/main.cpp b/test/MatrixMult/single/MatrixMult32/main.cpp index 1f7b66b..a77c7f8 100644 --- a/test/MatrixMult/single/MatrixMult32/main.cpp +++ b/test/MatrixMult/single/MatrixMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixMultAAH32/main.cpp b/test/MatrixMult/single/MatrixMultAAH32/main.cpp index b6616bf..83c4e01 100644 --- a/test/MatrixMult/single/MatrixMultAAH32/main.cpp +++ b/test/MatrixMult/single/MatrixMultAAH32/main.cpp @@ -1,8 +1,9 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" +#include "matrix_utils.hpp" #include "reference_linalg.hpp" static bool run_matmul_aah_cf32_test(uint16_t m, uint16_t n) { diff --git a/test/MatrixMult/single/MatrixMultAHB32/main.cpp b/test/MatrixMult/single/MatrixMultAHB32/main.cpp index 69b4498..c09b5b7 100644 --- a/test/MatrixMult/single/MatrixMultAHB32/main.cpp +++ b/test/MatrixMult/single/MatrixMultAHB32/main.cpp @@ -1,12 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include #include #include "cf32_utils.hpp" +#include "matrix_utils.hpp" #include "reference_linalg.hpp" static bool run_matmul_ahb_cf32_test(uint16_t m, uint16_t n, uint16_t k) { diff --git a/test/MatrixMult/single/MatrixVectorMult16/main.cpp b/test/MatrixMult/single/MatrixVectorMult16/main.cpp index f9acbdf..f6d987d 100644 --- a/test/MatrixMult/single/MatrixVectorMult16/main.cpp +++ b/test/MatrixMult/single/MatrixVectorMult16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "reference_linalg.hpp" diff --git a/test/MatrixMult/single/MatrixVectorMult32/main.cpp b/test/MatrixMult/single/MatrixVectorMult32/main.cpp index a60d304..4283a28 100644 --- a/test/MatrixMult/single/MatrixVectorMult32/main.cpp +++ b/test/MatrixMult/single/MatrixVectorMult32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include 
"reference_linalg.hpp" diff --git a/test/MatrixPseudoInv/direct/main.cpp b/test/MatrixPseudoInv/direct/main.cpp index ab578c6..74f99d8 100644 --- a/test/MatrixPseudoInv/direct/main.cpp +++ b/test/MatrixPseudoInv/direct/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2023 Arm Limited and/or its affiliates + Copyright 2023-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" @@ -10,9 +10,32 @@ #include static inline void -reference_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, - const armral_cmplx_f32_t *__restrict p_src, - armral_cmplx_f32_t *p_dst) { +reference_left_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, + const armral_cmplx_f32_t *__restrict p_src, + armral_cmplx_f32_t *p_dst) { + // Compute C = A^H * A + // We can use p_dst as an intermediate N-by-N array since it has size N-by-M, + // and N < M + auto *mat_aha = p_dst; + reference_matmul_aha_cf32(m, n, p_src, mat_aha); + + // Compute C + lambda * I + for (uint32_t i = 0; i < n; i++) { + uint32_t idx = i * n + i; + mat_aha[idx].re += lambda; + } + + // Compute B = C^(-1) + std::vector mat_inv(n * n); + reference_matinv_block(n, mat_aha, mat_inv.data()); + + // Compute B * A^H + reference_matmul_bah_cf32(m, n, p_src, mat_inv.data(), p_dst); +} + +static inline void reference_right_pseudo_inverse_direct( + uint32_t m, uint32_t n, float32_t lambda, + const armral_cmplx_f32_t *__restrict p_src, armral_cmplx_f32_t *p_dst) { // Compute C = A * A^H // We can use p_dst as an intermediate M-by-M array since it has size N-by-M, // and N >= M @@ -33,6 +56,16 @@ reference_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, reference_matmul_ahb_cf32(m, n, m, p_src, mat_inv.data(), p_dst); } +static inline void +reference_pseudo_inverse_direct(uint32_t m, uint32_t n, float32_t lambda, + const armral_cmplx_f32_t *__restrict p_src, + armral_cmplx_f32_t *p_dst) { + if (m > n) { + return reference_left_pseudo_inverse_direct(m, n, lambda, p_src, p_dst); + } + return reference_right_pseudo_inverse_direct(m, n, lambda, p_src, p_dst); +} + template static bool run_pseudo_inverse_direct_cf32_test( const char *name, uint32_t m, uint32_t n, float32_t lambda, @@ -56,19 +89,30 @@ bool run_all_tests(char const *test_name, char const *function_name, bool passed = true; const std::tuple params[] = { - {2, 5, -0.968591}, {2, 84, 0.191647}, {2, 67, 0.0}, - {3, 18, -1.218053}, {3, 138, 1.597186}, {3, 161, 0.0}, - {4, 20, -0.474817}, {4, 105, 0.944802}, {4, 94, 0.0}, - {8, 35, -1.991369}, {8, 200, -1.244298}, {8, 190, 0.0}, - {16, 32, 0.809352}, {16, 80, 1.810591}, {16, 117, 0.0}}; - for (const auto &[m, n, l] : params) { - printf("[%s] m=%d, n=%d, l=%f\n", function_name, m, n, l); - passed &= run_pseudo_inverse_direct_cf32_test(function_name, m, n, l, + {2, 5, -0.968591}, {2, 84, 0.191647}, {2, 2, 1.457848}, + {2, 67, 0.0}, {3, 18, -1.218053}, {3, 138, 1.597186}, + {3, 3, -1.2435186}, {3, 161, 0.0}, {4, 20, -0.474817}, + {4, 105, 0.944802}, {4, 4, 1.645646}, {4, 94, 0.0}, + {8, 35, -1.991369}, {8, 200, -1.244298}, {8, 8, 1.445767}, + {8, 190, 0.0}, {16, 32, 0.809352}, {16, 80, 1.810591}, + {16, 16, -0.426745}, {16, 117, 0.0}}; + for (const auto &[dim1, dim2, l] : params) { + printf("[%s] m=%d, n=%d, l=%f\n", function_name, dim1, dim2, l); + passed &= run_pseudo_inverse_direct_cf32_test(function_name, dim1, dim2, l, pseudo_inverse_under_test); + + // There is no need to test the square input cases again + if (dim1 != dim2) { + printf("[%s] m=%d, n=%d, 
l=%f\n", function_name, dim2, dim1, l); + passed &= run_pseudo_inverse_direct_cf32_test( + function_name, dim2, dim1, l, pseudo_inverse_under_test); + } } if (!passed) { + // GCOVR_EXCL_START printf("[%s] one or more tests failed!\n", test_name); + // GCOVR_EXCL_STOP } return passed; @@ -77,18 +121,22 @@ bool run_all_tests(char const *test_name, char const *function_name, int main() { bool passed = true; + // Tests for pseudo-inverse passed &= run_all_tests("PseudoInverseDirect", "armral_cmplx_pseudo_inverse_direct_f32", armral_cmplx_pseudo_inverse_direct_f32); - passed &= run_all_tests( - "PseudoInverseDirectNoAlloc", - "armral_cmplx_pseudo_inverse_direct_f32_noalloc", - [](uint32_t m, auto... args) { - std::vector buffer(m * m * sizeof(armral_cmplx_f32_t) + 3); - return armral_cmplx_pseudo_inverse_direct_f32_noalloc(m, args..., - buffer.data()); - }); + // Tests for non-allocating pseudo-inverse + passed &= + run_all_tests("PseudoInverseDirectNoAlloc", + "armral_cmplx_pseudo_inverse_direct_f32_noalloc", + [](uint32_t m, uint32_t n, auto... args) { + uint32_t size = m > n ? n : m; + std::vector buffer( + size * size * sizeof(armral_cmplx_f32_t) + 3); + return armral_cmplx_pseudo_inverse_direct_f32_noalloc( + m, n, args..., buffer.data()); + }); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/test/Modulation/main.cpp b/test/Modulation/main.cpp index f931ae4..0cb0a3a 100644 --- a/test/Modulation/main.cpp +++ b/test/Modulation/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/MuLaw/Compression/main.cpp b/test/MuLaw/Compression/main.cpp index 9eba4e6..c1b8e6c 100644 --- a/test/MuLaw/Compression/main.cpp +++ b/test/MuLaw/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/MuLaw/Decompression/main.cpp b/test/MuLaw/Decompression/main.cpp index f3f6a17..067af2d 100644 --- a/test/MuLaw/Decompression/main.cpp +++ b/test/MuLaw/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/ORanBlockScaling/Compression/main.cpp b/test/ORanBlockScaling/Compression/main.cpp index ddf6d1a..ac0356c 100644 --- a/test/ORanBlockScaling/Compression/main.cpp +++ b/test/ORanBlockScaling/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/ORanBlockScaling/Decompression/main.cpp b/test/ORanBlockScaling/Decompression/main.cpp index 89631ba..2174493 100644 --- a/test/ORanBlockScaling/Decompression/main.cpp +++ b/test/ORanBlockScaling/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Polar/crc_attachment/main.cpp b/test/Polar/crc_attachment/main.cpp index 9673d91..8d67cd4 100644 --- a/test/Polar/crc_attachment/main.cpp +++ 
b/test/Polar/crc_attachment/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "int8_utils.hpp" #include "polar_crc_attach_data.hpp" diff --git a/test/Polar/crc_attachment/polar_crc_attach_data.hpp b/test/Polar/crc_attachment/polar_crc_attach_data.hpp index 8c5dfeb..cb8c986 100644 --- a/test/Polar/crc_attachment/polar_crc_attach_data.hpp +++ b/test/Polar/crc_attachment/polar_crc_attach_data.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Polar/decoding/main.cpp b/test/Polar/decoding/main.cpp index 31ae785..5b36846 100644 --- a/test/Polar/decoding/main.cpp +++ b/test/Polar/decoding/main.cpp @@ -1,12 +1,13 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" #include #include +template static bool run_polar_decoding_test(uint32_t n, uint32_t e, uint32_t k, uint32_t n_pc, uint32_t n_pc_wm, uint32_t l) { @@ -69,13 +70,16 @@ static bool run_polar_decoding_test(uint32_t n, uint32_t e, uint32_t k, armral_polar_subchannel_deinterleave( k, frozen_mask.data(), out.data() + i * n / 8, data_deint.data()); -#ifdef ARMRAL_TESTING_NOALLOC - std::vector buffer(armral_polar_crc_check_noalloc_buffer_size(k)); - bool crc_pass = - armral_polar_crc_check_noalloc(data_deint.data(), k, buffer.data()); -#else - bool crc_pass = armral_polar_crc_check(data_deint.data(), k); -#endif + bool crc_pass; + if constexpr (test_noalloc) { + std::vector buffer( + armral_polar_crc_check_noalloc_buffer_size(k)); + crc_pass = + armral_polar_crc_check_noalloc(data_deint.data(), k, buffer.data()); + } else { + crc_pass = armral_polar_crc_check(data_deint.data(), k); + } + if (crc_pass) { return check_results_u8("POLAR DECODING", data_deint.data(), in.data(), (k + 7) / 8); @@ -94,13 +98,16 @@ int main(int argc, char **argv) { bool passed = true; for (auto l : {1, 2, 4, 8}) { for (auto n : {32, 64, 128, 256, 512, 1024}) { - for (int k = 25; k <= n; k += 7) { + for (int k = 25; k <= n; k += 11) { // test e >= n to check repetition doesn't affect the frozen mask. 
- for (int e = k; e <= n + 11; e += 11) { - passed &= run_polar_decoding_test(n, e, k, 0, 0, l); + for (int e = k; e <= n + 17; e += 17) { + passed &= run_polar_decoding_test(n, e, k, 0, 0, l); + passed &= run_polar_decoding_test(n, e, k, 0, 0, l); if (k + 3 <= n && k + 3 <= e) { - passed &= run_polar_decoding_test(n, e, k, 3, 0, l); - passed &= run_polar_decoding_test(n, e, k, 3, 1, l); + passed &= run_polar_decoding_test(n, e, k, 3, 0, l); + passed &= run_polar_decoding_test(n, e, k, 3, 1, l); + passed &= run_polar_decoding_test(n, e, k, 3, 0, l); + passed &= run_polar_decoding_test(n, e, k, 3, 1, l); } } } diff --git a/test/Polar/encoding/main.cpp b/test/Polar/encoding/main.cpp index 22220d8..7c1d9ac 100644 --- a/test/Polar/encoding/main.cpp +++ b/test/Polar/encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Polar/frozen/main.cpp b/test/Polar/frozen/main.cpp index 041c188..5be4671 100644 --- a/test/Polar/frozen/main.cpp +++ b/test/Polar/frozen/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/rate_matching/main.cpp b/test/Polar/rate_matching/main.cpp index 00376ce..6afd323 100644 --- a/test/Polar/rate_matching/main.cpp +++ b/test/Polar/rate_matching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/rate_recovery/main.cpp b/test/Polar/rate_recovery/main.cpp index 5d985ff..fe3ce7d 100644 --- a/test/Polar/rate_recovery/main.cpp +++ b/test/Polar/rate_recovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" @@ -15,15 +15,19 @@ static bool check_llrs_equal(const int8_t *result, const int8_t *expected, bool passed = true; for (uint32_t i = 0; i < n_values; ++i) { if (result[i] != expected[i]) { + // GCOVR_EXCL_START printf("Sample %u: LLR calculated = %d != LLR expected " "= %d --> ERROR \n", i, result[i], expected[i]); passed = false; + // GCOVR_EXCL_STOP } } if (!passed) { + // GCOVR_EXCL_START printf("Check failed!\n"); + // GCOVR_EXCL_STOP } else { printf("Check successful!\n"); } diff --git a/test/Polar/subchannel_deinterleave/main.cpp b/test/Polar/subchannel_deinterleave/main.cpp index 77ba43f..59b3ef8 100644 --- a/test/Polar/subchannel_deinterleave/main.cpp +++ b/test/Polar/subchannel_deinterleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Polar/subchannel_interleave/main.cpp b/test/Polar/subchannel_interleave/main.cpp index eb6b31f..bfc6e55 100644 --- a/test/Polar/subchannel_interleave/main.cpp +++ b/test/Polar/subchannel_interleave/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git 
a/test/SVD/main.cpp b/test/SVD/main.cpp index 7b1a5f9..a142790 100644 --- a/test/SVD/main.cpp +++ b/test/SVD/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "svd_sample_data.h" @@ -141,7 +141,9 @@ bool run_all_tests(char const *name, SVDFunction svd_function) { } if (!passed) { + // GCOVR_EXCL_START printf("[%s] one or more tests failed!\n", name); + // GCOVR_EXCL_STOP } return passed; } diff --git a/test/SVD/svd_sample_data.h b/test/SVD/svd_sample_data.h index e2cfab3..0add752 100644 --- a/test/SVD/svd_sample_data.h +++ b/test/SVD/svd_sample_data.h @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/SVD/svd_test.hpp b/test/SVD/svd_test.hpp index 293ed4d..a3ca485 100644 --- a/test/SVD/svd_test.hpp +++ b/test/SVD/svd_test.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -160,7 +160,7 @@ static inline void householder_qr(const int m, const int n, cf32_t *a, } // Apply implicitly Q to an input matrix C of the same dimension -// as the marix A that has been factorized into QR or bidiagonalisation. +// as the matrix A that has been factorized into QR or bidiagonalisation. static inline void apply_q(int m, int n, const cf32_t *a, const cf32_t *tau, cf32_t *c) { if (m < n) { @@ -171,7 +171,7 @@ static inline void apply_q(int m, int n, const cf32_t *a, const cf32_t *tau, } std::vector q(m * n); memcpy(q.data(), a, m * n * sizeof(cf32_t)); - column_major_matrix_view q_mat{q, m}; + column_major_matrix_view q_mat{q.data(), m}; column_major_matrix_view c_mat{c, m}; for (int i = n - 1; i >= 0; i--) { q_mat(i, i) = 1.0F; @@ -212,7 +212,7 @@ static inline std::vector get_q(const int m, const int n, // GCOVR_EXCL_STOP } std::vector q = a; - column_major_matrix_view q_mat{q, m}; + column_major_matrix_view q_mat{q.data(), m}; // Accumulate reflectors from right to left // Q = H1 * H2....* Hn. They are applied to identity. for (int i = n - 1; i >= 0; i--) { @@ -352,7 +352,7 @@ static inline void generate_svd_matrix(const int m, const int n, std::vector q2 = get_q(n, n, a2, tau2); // multiply left orthogonal matrix by S - column_major_matrix_view q2_mat{q2, n}; + column_major_matrix_view q2_mat{q2.data(), n}; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { q2_mat(i, j) *= s[i]; @@ -360,7 +360,7 @@ static inline void generate_svd_matrix(const int m, const int n, } // Apply Q1 to S*Q2, but first copy Q2 in an m * n matrix std::vector a_cmplx(m * n); - column_major_matrix_view q2_mat_mn{a_cmplx, m}; + column_major_matrix_view q2_mat_mn{a_cmplx.data(), m}; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { q2_mat_mn(i, j) = q2_mat(i, j); @@ -731,7 +731,7 @@ static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, const int n, std::vector &a, std::vector &s, std::vector &u, std::vector &vt) { - column_major_matrix_view a_mat{a, m}; + column_major_matrix_view a_mat{a.data(), m}; // Perform the QR factorization of A. std::vector tau(n); @@ -739,7 +739,7 @@ static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, // Extract the R. 
std::vector r(n * n); - column_major_matrix_view r_mat{r, n}; + column_major_matrix_view r_mat{r.data(), n}; for (int i = 0; i < n; i++) { for (int j = i; j < n; j++) { r_mat(i, j) = a_mat(i, j); @@ -758,8 +758,8 @@ static inline int qr_svd_cf32(const bool gen_singular_vect, const int m, // Copy u1 in u // Initialise u to zero in case it is not. u.assign(u.size(), 0.0F); - column_major_matrix_view u_mat{u, m}; - column_major_matrix_view u1_mat{u1, n}; + column_major_matrix_view u_mat{u.data(), m}; + column_major_matrix_view u1_mat{u1.data(), n}; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { u_mat(i, j) = u1_mat(i, j); @@ -786,7 +786,7 @@ static inline bool check_orthogonality(const int m, const int n, cf32_t *q) { // Build an identity matrix Id std::vector a(n * n); - column_major_matrix_view a_mat{a, n}; + column_major_matrix_view a_mat{a.data(), n}; for (int i = 0; i < n; i++) { a_mat(i, i) = 1.0F; } @@ -826,7 +826,7 @@ static inline bool check_qr_decomposition(int m, int n, const cf32_t *aref, // Extract R, allocate m-by-n memory for // the multiplication by A later std::vector r(m * n); - column_major_matrix_view r_mat{r, m}; + column_major_matrix_view r_mat{r.data(), m}; for (int i = 0; i < n; i++) { for (int j = i; j < n; j++) { r_mat(i, j) = a_mat(i, j); @@ -838,7 +838,7 @@ static inline bool check_qr_decomposition(int m, int n, const cf32_t *aref, // Copy Aref std::vector c(m * n); memcpy(c.data(), aref, m * n * sizeof(cf32_t)); - column_major_matrix_view c_mat{c, m}; + column_major_matrix_view c_mat{c.data(), m}; // Compute Aref = Aref - QR for (int i = 0; i < m; i++) { @@ -907,7 +907,7 @@ static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref, get_p(m, n, a, taup, p.data()); // Build explicitly the n-by-n bidiagonal matrix B std::vector b(n * n); - column_major_matrix_view b_mat{b, n}; + column_major_matrix_view b_mat{b.data(), n}; for (int i = 0; i < n - 1; i++) { b_mat(i, i) = d[i]; b_mat(i, i + 1) = e[i]; @@ -924,7 +924,7 @@ static inline bool check_bidiag_decomposition(int m, int n, const cf32_t *aref, // Compute Aref - Q * B * VT column_major_matrix_view aref_mat{aref, m}; - column_major_matrix_view c_mat{c, m}; + column_major_matrix_view c_mat{c.data(), m}; for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { c_mat(i, j) -= aref_mat(i, j); @@ -980,7 +980,7 @@ static inline bool check_svd_decomposition(int m, int n, const cf32_t *a, // U1 = U * S std::vector u1(m * n); column_major_matrix_view u_mat{u, m}; - column_major_matrix_view u1_mat{u1, m}; + column_major_matrix_view u1_mat{u1.data(), m}; for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { u1_mat(i, j) = u_mat(i, j) * s[j]; diff --git a/test/Scrambling/main.cpp b/test/Scrambling/main.cpp index ecb4014..777276f 100644 --- a/test/Scrambling/main.cpp +++ b/test/Scrambling/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "bit_utils.hpp" diff --git a/test/SeqGenerator/main.cpp b/test/SeqGenerator/main.cpp index f7e3e49..8bb2f61 100644 --- a/test/SeqGenerator/main.cpp +++ b/test/SeqGenerator/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/Turbo/decoding/main.cpp b/test/Turbo/decoding/main.cpp index a46ea21..027056e 100644 --- 
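The SVD helper `qr_svd_cf32` touched above first QR-factorizes the tall input, then takes the SVD of the small triangular factor and applies Q to recover the left singular vectors. The identity it relies on is:

```latex
A = QR,\qquad R = U_{1}\,\Sigma\,V^{H}
\;\Longrightarrow\;
A = \left(Q U_{1}\right)\Sigma\,V^{H},\qquad U = Q U_{1}.
```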
a/test/Turbo/decoding/main.cpp +++ b/test/Turbo/decoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" @@ -104,7 +104,6 @@ int main(int argc, char **argv) { passed &= run_turbo_decoding_parameter_test(); // Check decoder decodes correctly - for (auto k : valid_ks) { passed &= run_turbo_decoding_test("TurboDecoding", k, armral_turbo_decode_block); diff --git a/test/Turbo/encoding/main.cpp b/test/Turbo/encoding/main.cpp index fcad110..b945f22 100644 --- a/test/Turbo/encoding/main.cpp +++ b/test/Turbo/encoding/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" diff --git a/test/Turbo/encoding/reference_turbo_encoder.hpp b/test/Turbo/encoding/reference_turbo_encoder.hpp index d625df6..aec3668 100644 --- a/test/Turbo/encoding/reference_turbo_encoder.hpp +++ b/test/Turbo/encoding/reference_turbo_encoder.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Turbo/rate_matching/main.cpp b/test/Turbo/rate_matching/main.cpp index db8d2f6..be9c29f 100644 --- a/test/Turbo/rate_matching/main.cpp +++ b/test/Turbo/rate_matching/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/rate_recovery/main.cpp b/test/Turbo/rate_recovery/main.cpp index b27b4c1..36c748b 100644 --- a/test/Turbo/rate_recovery/main.cpp +++ b/test/Turbo/rate_recovery/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "armral.h" #include "int8_utils.hpp" diff --git a/test/Turbo/rate_recovery/rate_recovery_data.hpp b/test/Turbo/rate_recovery/rate_recovery_data.hpp index 7cec342..0117f14 100644 --- a/test/Turbo/rate_recovery/rate_recovery_data.hpp +++ b/test/Turbo/rate_recovery/rate_recovery_data.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/Turbo/turbo_test_data.hpp b/test/Turbo/turbo_test_data.hpp index 9efa245..cc47d69 100644 --- a/test/Turbo/turbo_test_data.hpp +++ b/test/Turbo/turbo_test_data.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/test/VectorDotProd/vecDot16/main.cpp b/test/VectorDotProd/vecDot16/main.cpp index 44b800a..0812631 100644 --- a/test/VectorDotProd/vecDot16/main.cpp +++ b/test/VectorDotProd/vecDot16/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot16_2/main.cpp b/test/VectorDotProd/vecDot16_2/main.cpp index 913f3a1..764b191 100644 --- a/test/VectorDotProd/vecDot16_2/main.cpp +++ b/test/VectorDotProd/vecDot16_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 
2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" @@ -40,7 +40,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_dot_test(n); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/VectorDotProd/vecDot16_2_32bit/main.cpp b/test/VectorDotProd/vecDot16_2_32bit/main.cpp index 63a6fd2..8fa6efb 100644 --- a/test/VectorDotProd/vecDot16_2_32bit/main.cpp +++ b/test/VectorDotProd/vecDot16_2_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" @@ -44,7 +44,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_dot_test(n); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/VectorDotProd/vecDot16_32bit/main.cpp b/test/VectorDotProd/vecDot16_32bit/main.cpp index 30ae98b..f400649 100644 --- a/test/VectorDotProd/vecDot16_32bit/main.cpp +++ b/test/VectorDotProd/vecDot16_32bit/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/VectorDotProd/vecDot32/main.cpp b/test/VectorDotProd/vecDot32/main.cpp index 7229866..beca6f0 100644 --- a/test/VectorDotProd/vecDot32/main.cpp +++ b/test/VectorDotProd/vecDot32/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -29,7 +29,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_dot_test(n); } exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/VectorDotProd/vecDot32_2/main.cpp b/test/VectorDotProd/vecDot32_2/main.cpp index ca50a15..b40850c 100644 --- a/test/VectorDotProd/vecDot32_2/main.cpp +++ b/test/VectorDotProd/vecDot32_2/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cf32_utils.hpp" @@ -38,7 +38,7 @@ int main(int argc, char **argv) { 1, 2, 3, 4, 5, 7, 8, 15, 16, 32, 64, 100, 128, 151, 256, 512, 1024, }; bool passed = true; - for (auto &n : params) { + for (const auto &n : params) { passed &= run_vec_dot_test(n); } exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); diff --git a/test/XRanBlockFloat/Compression/main.cpp b/test/XRanBlockFloat/Compression/main.cpp index 14a9e54..43824bd 100644 --- a/test/XRanBlockFloat/Compression/main.cpp +++ b/test/XRanBlockFloat/Compression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/test/XRanBlockFloat/Decompression/main.cpp b/test/XRanBlockFloat/Decompression/main.cpp index e50737b..4506d3b 100644 --- a/test/XRanBlockFloat/Decompression/main.cpp +++ b/test/XRanBlockFloat/Decompression/main.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "cs16_utils.hpp" #include "int8_utils.hpp" diff --git a/utils/bit_utils.hpp b/utils/bit_utils.hpp index fc85ac2..0de9b1c 100644 --- a/utils/bit_utils.hpp +++ b/utils/bit_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/cf32_utils.hpp b/utils/cf32_utils.hpp index 7e41eb1..41f2c4f 100644 --- a/utils/cf32_utils.hpp +++ b/utils/cf32_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/cs16_utils.hpp b/utils/cs16_utils.hpp index 0d64613..6824f3c 100644 --- a/utils/cs16_utils.hpp +++ b/utils/cs16_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/fft_utils.hpp b/utils/fft_utils.hpp index 5de9ef2..c99371c 100644 --- a/utils/fft_utils.hpp +++ b/utils/fft_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/int8_utils.hpp b/utils/int8_utils.hpp index 430baf0..ec5d103 100644 --- a/utils/int8_utils.hpp +++ b/utils/int8_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/matrix_utils.hpp b/utils/matrix_utils.hpp index 3b27451..d15e31e 100644 --- a/utils/matrix_utils.hpp +++ b/utils/matrix_utils.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -8,20 +8,6 @@ #include "reference_linalg.hpp" #include "rng.hpp" -/* - * Multiply a vector by a uniform scaling factor. - * - * This is explicitly noinline since it avoids a compiler bug with GCC 8.2.0 - * where the code is incorrectly inlined into gen_hermitian_matrix. - */ -static inline void __attribute__((noinline)) -cscal(uint32_t n, armral_cmplx_f32_t *a, armral_cmplx_f32_t s) { - for (unsigned i = 0; i < n; ++i) { - a[i].re *= s.re; - a[i].im *= s.im; - } -} - /* * Generate random values, the resulting matrix will have linearly independent * columns with probability almost 1. 
@@ -261,8 +247,8 @@ static inline bool check_results_identity(const armral_cmplx_f32_t *mat, uint32_t m, int verbose = 0) { bool passed = true; // Init arrays - armral_cmplx_f32_t id[m * m]; - armral_cmplx_f32_t mm[m * m]; + std::vector id(m * m); + std::vector mm(m * m); for (unsigned i = 0; i < m; ++i) { for (unsigned j = 0; j < m; ++j) { if (i == j) { @@ -280,248 +266,23 @@ static inline bool check_results_identity(const armral_cmplx_f32_t *mat, { std::vector> mm64(m * m); reference_zgemm(m, m, m, 1.0F, m64, inv_m64, 0.0F, mm64); - convert_vector_to_cf32_array(m * m, mm64, mm); - passed &= check_results_mat_inv("MM^{-1} - Id", (float *)mm, (float *)id, - 2 * m * m, (float)m, (float)m, verbose); + convert_vector_to_cf32_array(m * m, mm64, mm.data()); + passed &= check_results_mat_inv("MM^{-1} - Id", (float *)mm.data(), + (float *)id.data(), 2 * m * m, (float)m, + (float)m, verbose); } // MM^{-1} { std::vector> mm64(m * m); reference_zgemm(m, m, m, 1.0F, inv_m64, m64, 0.0F, mm64); - convert_vector_to_cf32_array(m * m, mm64, mm); - passed &= check_results_mat_inv("M^{-1}M - Id", (float *)mm, (float *)id, - 2 * m * m, (float)m, (float)m, verbose); + convert_vector_to_cf32_array(m * m, mm64, mm.data()); + passed &= check_results_mat_inv("M^{-1}M - Id", (float *)mm.data(), + (float *)id.data(), 2 * m * m, (float)m, + (float)m, verbose); } return passed; } -/* - * Reorder matrices to allow easy access to blocks. - */ -static unsigned zorder_y_of(unsigned index) { - unsigned y = 0; - for (unsigned b = 0, k = 0; (1U << b) <= index; b += 2, k++) { - y += static_cast((index & (1U << b)) != 0) << k; - } - return y; -} - -static unsigned zorder_x_of(unsigned index) { - return zorder_y_of(index >> 1); -} - -/* - * Convert from z-order to row-major. - */ -static std::vector> -zorder_to_rowmajor(uint32_t m, const std::vector> &z) { - std::vector> a(m * m); - for (unsigned i = 0; i < m; ++i) { - for (unsigned j = 0; j < m; ++j) { - unsigned ijx = zorder_x_of(i * m + j); - unsigned ijy = zorder_y_of(i * m + j); - a[ijx * m + ijy] = z[i * m + j]; - } - } - return a; -} - -/* - * Convert from row-major to z-order. - */ -static std::vector> -rowmajor_to_zorder(uint32_t m, const std::vector> &a) { - std::vector> z(m * m); - for (unsigned i = 0; i < m; ++i) { - for (unsigned j = 0; j < m; ++j) { - unsigned ijx = zorder_x_of(i * m + j); - unsigned ijy = zorder_y_of(i * m + j); - z[i * m + j] = a[ijx * m + ijy]; - } - } - return z; -} - -/* - * General matrix multiplication on matrices stored in z-order. 
- */ -static void reference_zgemm_zorder(uint32_t m, const double alpha, - const std::vector> &a, - const std::vector> &b, - const double beta, - std::vector> &c) { - // Convert to row-major - auto a64 = zorder_to_rowmajor(m, a); - auto b64 = zorder_to_rowmajor(m, b); - auto c64 = zorder_to_rowmajor(m, c); - - // Evaluate double precision matrix multiply - reference_zgemm(m, m, m, alpha, a64, b64, beta, c64); - - // Convert back to original order - c = rowmajor_to_zorder(m, c64); -} - -static std::vector> -reference_zgeinv_2x2(uint32_t m, const std::vector> &mat) { - std::vector> inv_m(m * m); - // Inverse 2x2 matrix using analytic expression - std::complex rdet = 1.0 / (mat[0] * mat[3] - mat[1] * mat[2]); - inv_m[0] = +rdet * mat[3]; - inv_m[1] = -rdet * mat[1]; - inv_m[2] = -rdet * mat[2]; - inv_m[3] = +rdet * mat[0]; - return inv_m; -} - -static std::vector> -reference_zgeinv_3x3(uint32_t m, const std::vector> &mat) { - std::vector> inv_m(m * m); - auto a0 = mat[0]; - auto a1 = mat[1]; - auto a2 = mat[2]; - auto a3 = mat[4]; - auto a4 = mat[5]; - auto a5 = mat[8]; - - auto c1 = mat[3]; - auto c2 = mat[6]; - auto c4 = mat[7]; - - auto adj00 = a3 * a5 - a4 * c4; - auto adj11 = a0 * a5 - a2 * c2; - auto adj22 = a0 * a3 - a1 * c1; - - auto adj10 = c1 * a5 - c2 * a4; - auto adj20 = c1 * c4 - c2 * a3; - auto adj01 = a1 * a5 - c4 * a2; - auto adj21 = a0 * c4 - c2 * a1; - auto adj02 = a1 * a4 - a3 * a2; - auto adj12 = a0 * a4 - c1 * a2; - - // Compute cofactors (apply negative signs) - adj01 = -adj01; - adj10 = -adj10; - adj12 = -adj12; - adj21 = -adj21; - - // Determinant: A_{0:} * adj(A)_{:0} - auto inv_det = 1.0 / (a0 * adj00 + a1 * adj10 + a2 * adj20); - - // Write into output array - inv_m[0] = adj00 * inv_det; - inv_m[1] = adj01 * inv_det; - inv_m[2] = adj02 * inv_det; - inv_m[3] = adj10 * inv_det; - inv_m[4] = adj11 * inv_det; - inv_m[5] = adj12 * inv_det; - inv_m[6] = adj20 * inv_det; - inv_m[7] = adj21 * inv_det; - inv_m[8] = adj22 * inv_det; - return inv_m; -} - -/* - * Matrix Inversion using blockwise approach (recursive implementation). 
- * - * M = [A B] M^{-1} = [X^{-1} -A^{-1}BU^{-1}] - * [C D] [-D^{-1}CX^{-1} U^{-1} ] - */ -static std::vector> -reference_zgeinv(uint32_t m, const std::vector> &mat) { - if (m == 2) { - return reference_zgeinv_2x2(m, mat); - } - std::vector> inv_m(m * m); - // Compute each block separately using reference matrix inversion (recursive - // process) - unsigned mm = m / 2; - std::vector> a(mat.begin() + 0 * mm * mm, - mat.begin() + 1 * mm * mm); - std::vector> b(mat.begin() + 1 * mm * mm, - mat.begin() + 2 * mm * mm); - std::vector> c(mat.begin() + 2 * mm * mm, - mat.begin() + 3 * mm * mm); - std::vector> d(mat.begin() + 3 * mm * mm, - mat.begin() + 4 * mm * mm); - - // Inverse of A and D - auto inv_a = reference_zgeinv(mm, a); - auto inv_d = reference_zgeinv(mm, d); - - // M00^{-1} = X^{-1} = (A - BD^{-1}C)^{-1} - std::vector> y(mm * mm); - auto x = a; - reference_zgemm_zorder(mm, 1.0, inv_d, c, 0.0, y); - reference_zgemm_zorder(mm, -1.0, b, y, 1.0, x); - auto inv_m00 = reference_zgeinv(mm, x); - - // M10^{-1} = -D^{-1}C X^{-1} - std::vector> inv_m10(mm * mm); - reference_zgemm_zorder(mm, -1.0, y, inv_m00, 0.0, inv_m10); - - // M11^{-1} = U^{-1} = (D - CA^{-1}B)^{-1} - std::vector> v(mm * mm); - auto u = d; - reference_zgemm_zorder(mm, 1.0, inv_a, b, 0.0, v); - reference_zgemm_zorder(mm, -1.0, c, v, 1.0, u); - auto inv_m11 = reference_zgeinv(mm, u); - - // M01 = -A^{-1}B U^{-1} - std::vector> inv_m01(mm * mm); - reference_zgemm_zorder(mm, -1.0, v, inv_m11, 0.0, inv_m01); - - // Set inverse matrix block per block - inv_m.clear(); - inv_m.insert(inv_m.end(), inv_m00.begin(), inv_m00.end()); - inv_m.insert(inv_m.end(), inv_m01.begin(), inv_m01.end()); - inv_m.insert(inv_m.end(), inv_m10.begin(), inv_m10.end()); - inv_m.insert(inv_m.end(), inv_m11.begin(), inv_m11.end()); - - return inv_m; -} - -static inline std::vector> -reference_zgeinv_small(uint32_t m, - const std::vector> &mat) { - if (m == 2) { - return reference_zgeinv_2x2(m, mat); - } - if (m == 3) { - return reference_zgeinv_3x3(m, mat); - } - // GCOVR_EXCL_START - assert(false && "Small matrix inverse only defined for m = 2 or m = 3"); - // GCOVR_EXCL_STOP - return {}; -} - -/* - * Run reference Matrix Inversion based on blockwise approach. 
- */ -static inline void reference_matinv_block(uint32_t m, - const armral_cmplx_f32_t *a, - armral_cmplx_f32_t *b) { - - // Init double precision input matrix (use z-order for easy access to blocks) - auto a_tmp = convert_cf32_array_to_vector(m * m, a); - - // Bypass z-ordering for small cases - if (m == 2 || m == 3) { - auto b_tmp = reference_zgeinv_small(m, a_tmp); - convert_vector_to_cf32_array(m * m, b_tmp, b); - } else { - auto a64 = rowmajor_to_zorder(m, a_tmp); - - // Evaluate double precision inverse - auto b64 = reference_zgeinv(m, a64); - - // Round back to single precision - auto b_tmp = zorder_to_rowmajor(m, b64); - convert_vector_to_cf32_array(m * m, b_tmp, b); - } -} - /* * Unpack data from batched format into a contiguous array */ @@ -566,3 +327,24 @@ static inline void print_cmplx_mat(const std::string &ref, uint32_t m, printf("]\n"); } } + +/* + * Return the number of floating-point operations required to calculate a length-n + * complex dot product + */ +static inline uint32_t cmplx_dot_nflops(uint32_t n) { + // A complex multiplication requires 6 floating-point operations + uint32_t op_mul = 6; + // A complex multiply-accumulate requires 8 floating-point operations + uint32_t op_mla = 8; + + uint32_t nflops = 0; + if (n > 0) { + // The cost of multiplying the first two vector entries together + nflops += op_mul; + // The cost of multiplying the remaining (n-1) vector entries + // and accumulating into the dot product + nflops += (n - 1) * op_mla; + } + return nflops; +} diff --git a/utils/qint64.hpp b/utils/qint64.hpp index efb5aba..8922edf 100644 --- a/utils/qint64.hpp +++ b/utils/qint64.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once diff --git a/utils/reference_linalg.hpp b/utils/reference_linalg.hpp index 0a05f10..0960d44 100644 --- a/utils/reference_linalg.hpp +++ b/utils/reference_linalg.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once @@ -12,6 +12,20 @@ #include #include +/* + * Multiply a vector by a uniform scaling factor. + * + * This is explicitly noinline since it avoids a compiler bug with GCC 8.2.0 + * where the code is incorrectly inlined into gen_hermitian_matrix. + */ +static inline void __attribute__((noinline)) +cscal(uint32_t n, armral_cmplx_f32_t *a, armral_cmplx_f32_t s) { + for (unsigned i = 0; i < n; ++i) { + a[i].re *= s.re; + a[i].im *= s.im; + } +} + /* * ZGEMM: General complex double matrix multiplication C = beta*C + alpha*A*B */ @@ -31,6 +45,207 @@ static inline void reference_zgemm(uint16_t m, uint16_t n, uint16_t p, } } +/* + * Reorder matrices to allow easy access to blocks. + */ +static unsigned zorder_y_of(unsigned index) { + unsigned y = 0; + for (unsigned b = 0, k = 0; (1U << b) <= index; b += 2, k++) { + y += static_cast((index & (1U << b)) != 0) << k; + } + return y; +} + +static unsigned zorder_x_of(unsigned index) { + return zorder_y_of(index >> 1); +} + +/* + * Convert from z-order to row-major. + */ +static std::vector> +zorder_to_rowmajor(uint32_t m, const std::vector> &z) { + std::vector> a(m * m); + for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + unsigned ijx = zorder_x_of(i * m + j); + unsigned ijy = zorder_y_of(i * m + j); + a[ijx * m + ijy] = z[i * m + j]; + } + } + return a; +} + +/* + * Convert from row-major to z-order. 
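The `cmplx_dot_nflops` helper that moves into `matrix_utils.hpp` above counts one 6-flop complex multiply for the first pair of elements and an 8-flop complex multiply-accumulate for each of the remaining n−1 pairs; for example, a length-4 complex dot product is counted as 6 + 3·8 = 30 floating-point operations.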
+ */ +static std::vector> +rowmajor_to_zorder(uint32_t m, const std::vector> &a) { + std::vector> z(m * m); + for (unsigned i = 0; i < m; ++i) { + for (unsigned j = 0; j < m; ++j) { + unsigned ijx = zorder_x_of(i * m + j); + unsigned ijy = zorder_y_of(i * m + j); + z[i * m + j] = a[ijx * m + ijy]; + } + } + return z; +} + +/* + * General matrix multiplication on matrices stored in z-order. + */ +static void reference_zgemm_zorder(uint32_t m, const double alpha, + const std::vector> &a, + const std::vector> &b, + const double beta, + std::vector> &c) { + // Convert to row-major + auto a64 = zorder_to_rowmajor(m, a); + auto b64 = zorder_to_rowmajor(m, b); + auto c64 = zorder_to_rowmajor(m, c); + + // Evaluate double precision matrix multiply + reference_zgemm(m, m, m, alpha, a64, b64, beta, c64); + + // Convert back to original order + c = rowmajor_to_zorder(m, c64); +} + +static std::vector> +reference_zgeinv_2x2(uint32_t m, const std::vector> &mat) { + std::vector> inv_m(m * m); + // Inverse 2x2 matrix using analytic expression + std::complex rdet = 1.0 / (mat[0] * mat[3] - mat[1] * mat[2]); + inv_m[0] = +rdet * mat[3]; + inv_m[1] = -rdet * mat[1]; + inv_m[2] = -rdet * mat[2]; + inv_m[3] = +rdet * mat[0]; + return inv_m; +} + +static std::vector> +reference_zgeinv_3x3(uint32_t m, const std::vector> &mat) { + std::vector> inv_m(m * m); + auto a0 = mat[0]; + auto a1 = mat[1]; + auto a2 = mat[2]; + auto a3 = mat[4]; + auto a4 = mat[5]; + auto a5 = mat[8]; + + auto c1 = mat[3]; + auto c2 = mat[6]; + auto c4 = mat[7]; + + auto adj00 = a3 * a5 - a4 * c4; + auto adj11 = a0 * a5 - a2 * c2; + auto adj22 = a0 * a3 - a1 * c1; + + auto adj10 = c1 * a5 - c2 * a4; + auto adj20 = c1 * c4 - c2 * a3; + auto adj01 = a1 * a5 - c4 * a2; + auto adj21 = a0 * c4 - c2 * a1; + auto adj02 = a1 * a4 - a3 * a2; + auto adj12 = a0 * a4 - c1 * a2; + + // Compute cofactors (apply negative signs) + adj01 = -adj01; + adj10 = -adj10; + adj12 = -adj12; + adj21 = -adj21; + + // Determinant: A_{0:} * adj(A)_{:0} + auto inv_det = 1.0 / (a0 * adj00 + a1 * adj10 + a2 * adj20); + + // Write into output array + inv_m[0] = adj00 * inv_det; + inv_m[1] = adj01 * inv_det; + inv_m[2] = adj02 * inv_det; + inv_m[3] = adj10 * inv_det; + inv_m[4] = adj11 * inv_det; + inv_m[5] = adj12 * inv_det; + inv_m[6] = adj20 * inv_det; + inv_m[7] = adj21 * inv_det; + inv_m[8] = adj22 * inv_det; + return inv_m; +} + +/* + * Matrix Inversion using blockwise approach (recursive implementation). 
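The z-order helpers that move into `reference_linalg.hpp` above de-interleave a linear index into row and column coordinates, so that each quadrant of a power-of-two matrix occupies a contiguous range of the flattened array. The self-contained round-trip check below copies the same bit arithmetic and is illustrative only.

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Even-position bits of a linear index give one coordinate, odd-position
// bits give the other (same logic as zorder_y_of/zorder_x_of above).
static unsigned zorder_y_of(unsigned index) {
  unsigned y = 0;
  for (unsigned b = 0, k = 0; (1U << b) <= index; b += 2, k++) {
    y += static_cast<unsigned>((index & (1U << b)) != 0) << k;
  }
  return y;
}

static unsigned zorder_x_of(unsigned index) {
  return zorder_y_of(index >> 1);
}

int main() {
  // For a power-of-two dimension the mapping is a bijection, so converting
  // row-major -> z-order -> row-major must reproduce the original matrix.
  const unsigned m = 4;
  std::vector<int> a(m * m);
  for (unsigned i = 0; i < m * m; ++i) {
    a[i] = static_cast<int>(i);
  }

  // Row-major to z-order (same index arithmetic as rowmajor_to_zorder).
  std::vector<int> z(m * m);
  for (unsigned idx = 0; idx < m * m; ++idx) {
    z[idx] = a[zorder_x_of(idx) * m + zorder_y_of(idx)];
  }

  // Z-order back to row-major (as in zorder_to_rowmajor).
  std::vector<int> back(m * m);
  for (unsigned idx = 0; idx < m * m; ++idx) {
    back[zorder_x_of(idx) * m + zorder_y_of(idx)] = z[idx];
  }

  for (unsigned i = 0; i < m * m; ++i) {
    assert(back[i] == a[i]);
  }
  printf("z-order round trip OK for m = %u\n", m);
  return 0;
}
```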
+ * + * M = [A B] M^{-1} = [X^{-1} -A^{-1}BU^{-1}] + * [C D] [-D^{-1}CX^{-1} U^{-1} ] + */ +static std::vector> +reference_zgeinv(uint32_t m, const std::vector> &mat) { + if (m == 2) { + return reference_zgeinv_2x2(m, mat); + } + std::vector> inv_m(m * m); + // Compute each block separately using reference matrix inversion (recursive + // process) + unsigned mm = m / 2; + std::vector> a(mat.begin() + 0 * mm * mm, + mat.begin() + 1 * mm * mm); + std::vector> b(mat.begin() + 1 * mm * mm, + mat.begin() + 2 * mm * mm); + std::vector> c(mat.begin() + 2 * mm * mm, + mat.begin() + 3 * mm * mm); + std::vector> d(mat.begin() + 3 * mm * mm, + mat.begin() + 4 * mm * mm); + + // Inverse of A and D + auto inv_a = reference_zgeinv(mm, a); + auto inv_d = reference_zgeinv(mm, d); + + // M00^{-1} = X^{-1} = (A - BD^{-1}C)^{-1} + std::vector> y(mm * mm); + auto x = a; + reference_zgemm_zorder(mm, 1.0, inv_d, c, 0.0, y); + reference_zgemm_zorder(mm, -1.0, b, y, 1.0, x); + auto inv_m00 = reference_zgeinv(mm, x); + + // M10^{-1} = -D^{-1}C X^{-1} + std::vector> inv_m10(mm * mm); + reference_zgemm_zorder(mm, -1.0, y, inv_m00, 0.0, inv_m10); + + // M11^{-1} = U^{-1} = (D - CA^{-1}B)^{-1} + std::vector> v(mm * mm); + auto u = d; + reference_zgemm_zorder(mm, 1.0, inv_a, b, 0.0, v); + reference_zgemm_zorder(mm, -1.0, c, v, 1.0, u); + auto inv_m11 = reference_zgeinv(mm, u); + + // M01 = -A^{-1}B U^{-1} + std::vector> inv_m01(mm * mm); + reference_zgemm_zorder(mm, -1.0, v, inv_m11, 0.0, inv_m01); + + // Set inverse matrix block per block + inv_m.clear(); + inv_m.insert(inv_m.end(), inv_m00.begin(), inv_m00.end()); + inv_m.insert(inv_m.end(), inv_m01.begin(), inv_m01.end()); + inv_m.insert(inv_m.end(), inv_m10.begin(), inv_m10.end()); + inv_m.insert(inv_m.end(), inv_m11.begin(), inv_m11.end()); + + return inv_m; +} + +static inline std::vector> +reference_zgeinv_small(uint32_t m, + const std::vector> &mat) { + if (m == 2) { + return reference_zgeinv_2x2(m, mat); + } + if (m == 3) { + return reference_zgeinv_3x3(m, mat); + } + // GCOVR_EXCL_START + assert(false && "Small matrix inverse only defined for m = 2 or m = 3"); + // GCOVR_EXCL_STOP + return {}; +} + /* * Converting between armral_cmplx_f32_t and std::complex */ @@ -117,6 +332,26 @@ static inline void reference_matmul_cf32(uint16_t m, uint16_t n, uint16_t p, } } +/* + * Reference conjugate transpose matrix multiplication (C=B * A^H) on cf32 input + * matrices + */ +static inline void reference_matmul_bah_cf32( + uint16_t m, uint16_t n, const armral_cmplx_f32_t *__restrict p_src_a, + const armral_cmplx_f32_t *__restrict p_src_b, armral_cmplx_f32_t *p_dst) { + for (uint16_t i = 0; i < n; i++) { + for (uint16_t j = 0; j < m; j++) { + std::complex dot = 0.; + for (uint16_t k = 0; k < n; k++) { + auto ah_jk = complex_convert(p_src_a[j * n + k]); + auto b_ik = complex_convert(p_src_b[i * n + k]); + dot += b_ik * std::conj(ah_jk); + } + p_dst[i * m + j] = complex_convert(dot); + } + } +} + /* * Reference conjugate transpose matrix multiplication (C=A^H * B) on cf32 input * matrices @@ -179,22 +414,51 @@ reference_matmul_aah_cf32(uint16_t m, uint16_t n, } /* - * Return the number of floating-point operatins required to calculate a length-n - * complex dot product + * Reference matrix multiplication (C=A^H*A) on a cf32 input matrix */ -static inline uint32_t cmplx_dot_nflops(uint32_t n) { - // A complex multiplication requires 6 floating-point operations - uint32_t op_mul = 6; - // A complex multiply-accumulate requires 8 floating-point operations - uint32_t op_mla = 8; 
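The recursive `reference_zgeinv` added above implements the standard blockwise inverse via Schur complements; spelling out the X and U that the code forms (as `x` and `u`) before recursing:

```latex
M = \begin{bmatrix} A & B \\ C & D \end{bmatrix},\qquad
M^{-1} = \begin{bmatrix}
X^{-1} & -A^{-1} B\, U^{-1} \\
-D^{-1} C\, X^{-1} & U^{-1}
\end{bmatrix},
\qquad
X = A - B D^{-1} C,\quad U = D - C A^{-1} B .
```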
+static inline void +reference_matmul_aha_cf32(uint16_t m, uint16_t n, + const armral_cmplx_f32_t *__restrict p_src, + armral_cmplx_f32_t *p_dst) { + for (uint16_t i = 0; i < n; i++) { + for (uint16_t j = 0; j < n; j++) { + std::complex dot = 0.; + for (uint16_t k = 0; k < m; k++) { + uint32_t ah_idx = k * n + i; + uint32_t a_idx = k * n + j; + dot += std::conj(complex_convert(p_src[ah_idx])) * + complex_convert(p_src[a_idx]); + } + if (i == j) { + dot.imag(0.); + } + p_dst[i * n + j] = complex_convert(dot); + } + } +} + +/* + * Run reference Matrix Inversion based on blockwise approach. + */ +static inline void reference_matinv_block(uint32_t m, + const armral_cmplx_f32_t *a, + armral_cmplx_f32_t *b) { + + // Init double precision input matrix (use z-order for easy access to blocks) + auto a_tmp = convert_cf32_array_to_vector(m * m, a); + + // Bypass z-ordering for small cases + if (m == 2 || m == 3) { + auto b_tmp = reference_zgeinv_small(m, a_tmp); + convert_vector_to_cf32_array(m * m, b_tmp, b); + } else { + auto a64 = rowmajor_to_zorder(m, a_tmp); + + // Evaluate double precision inverse + auto b64 = reference_zgeinv(m, a64); - uint32_t nflops = 0; - if (n > 0) { - // The cost of multiplying the first two vector entries together - nflops += op_mul; - // The cost of multiplying the remaining (n-1) vector entries - // and accumulating into the dot product - nflops += (n - 1) * op_mla; + // Round back to single precision + auto b_tmp = zorder_to_rowmajor(m, b64); + convert_vector_to_cf32_array(m * m, b_tmp, b); } - return nflops; } diff --git a/utils/rng.cpp b/utils/rng.cpp index 7b16c72..e14d97d 100644 --- a/utils/rng.cpp +++ b/utils/rng.cpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #include "rng.hpp" diff --git a/utils/rng.hpp b/utils/rng.hpp index c94d4e4..a6e09dc 100644 --- a/utils/rng.hpp +++ b/utils/rng.hpp @@ -1,6 +1,6 @@ /* Arm RAN Acceleration Library - Copyright 2020-2023 Arm Limited and/or its affiliates + Copyright 2020-2024 Arm Limited and/or its affiliates */ #pragma once -- GitLab