diff --git a/CREDITS.md b/CREDITS.md
index 9d655731b27cd29a3715e75b58f09d69839183ff..b5287fd24ecea884fb8108f90e207b1b390cf76c 100644
--- a/CREDITS.md
+++ b/CREDITS.md
@@ -2,6 +2,21 @@
 In addition to the primary development being done by Arm, the following people
 and organizations have contributed to Arm RAN Acceleration Library:
 
+- The following Google Highway implementation has been updated:
+  `highway/arm_ldpc_decoder.cpp`,
+  contributed by Cambridge Consultants. See
+  .
+
+- The following Google Highway implementations:
+  `highway/arm_turbo_decoder_batch.hpp`,
+  `highway/arm_turbo_decoder_single.hpp`,
+  `highway/arm_polar_decoder.cpp`,
+  `highway/arm_polar_encoder.cpp`,
+  `highway/arm_polar_frozen_bits.cpp`,
+  `highway/fft_*`
+  have been contributed by Cambridge Consultants. See
+  .
+
 - The following Google Highway implementations:
   `highway/arm_scrambling.cpp`,
   `highway/arm_mat_seq_generator.cpp`,
diff --git a/armral_hwy.cmake b/armral_hwy.cmake
index 58221bb6db9a86d017a6777fb75b40d4c7b5be26..e7f92a530ce8b1d6516d47e79d9d9a33d5280b8e 100644
--- a/armral_hwy.cmake
+++ b/armral_hwy.cmake
@@ -88,6 +88,16 @@ set_property(
   PROPERTY COMPILE_DEFINITIONS
            HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE)
 
+# The LDPC decoder requires the [S]VQRDMULH instruction, which is only
+# available under NEON and SVE2 on AArch64. We have therefore disabled SVE for
+# all Arm platforms when VQRDMULH is required, to avoid falling back to
+# (slower) generic implementations.
+set_property(
+  SOURCE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE_256|HWY_SVE)
+
 # The Turbo decoder implementation does not support scalable vectors and is
 # memory access heavy. The overhead of implicit masking when using fixed 128-bit
 # vectors causes a ~60% overhead. Benchmarks include the batch and single header
@@ -193,8 +203,8 @@ set(ARMRAL_LIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp
diff --git a/include/armral.h b/include/armral.h
index 434f980951fcac57d74d58467c385779bc3e3910..5c9810af6e8084a0c3d8ba5310c15c7eae04cb73 100644
--- a/include/armral.h
+++ b/include/armral.h
@@ -95,8 +95,8 @@
 #pragma GCC diagnostic pop
 #endif
 #else
-// GCC sometimes complains about declaration shadowing members in arm_neon-inl.h.
-// nothing we can do about that, so ignore it!
+// GCC sometimes complains about declaration shadowing members in
+// arm_neon-inl.h. nothing we can do about that, so ignore it!
#ifndef __clang__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 284cbc2ada90147ffaa6ee34c054db43cc963dc1..a5c8f9a961e5af35f40e84f80f479f4e3241150d 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -11,8 +11,8 @@ namespace hn = hwy::HWY_NAMESPACE; -// Allow compilation on non-arm architectures by aliasing poly64_t to an existing type -// Test if arm_neon.h has been included +// Allow compilation on non-arm architectures by aliasing poly64_t to an +// existing type Test if arm_neon.h has been included #ifndef _AARCH64_NEON_H_ using poly64_t = uint64_t; #endif @@ -23,7 +23,8 @@ using poly64_t = uint64_t; // vaddq_p64 = Xor // vrev64q_u8 = Reverse8 // vld1q_p64 = LoadU -// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced pointer +// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced +// pointer template static inline Vec_u64x2 load_p64x2(const uint64_t *p_in) { diff --git a/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07ec726b5fc4ad09d91af42e2672cd6a7f63bc79 --- /dev/null +++ b/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp @@ -0,0 +1,812 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ + +#include "../ldpc_coding.hpp" +#include "armral.h" +#include "utils/allocators.hpp" +#include "utils/bits_to_bytes.hpp" + +#include +#include + +#include +#include +#include + +namespace hn = hwy::HWY_NAMESPACE; + +namespace armral::ldpc { + +// Check nodes process the received information, update it, and send it back to +// the connected variable nodes. +// l is updated belief. +// r is extrinsic information. +// min values, signs and sign products are passed as input argument to update +// the belief and store the extrinsic information. 
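+//
+// As a rough, illustrative scalar sketch (not the vectorized path below, and
+// using hypothetical names that mirror the scratch buffers), the per-element
+// update amounts to:
+//
+//   int16_t mag   = (min1_pos[zb] == col) ? min2[zb] : min1[zb]; // exclude own contribution
+//   int16_t r_new = sign[zb] * sign_prod[zb] * mag;              // re-apply signs
+//   r[m][n]       = r_new;                                       // extrinsic message
+//   l[n]          = sat_add(l[n], r_new);                        // updated belief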
+HWY_FORCED_INLINE void +update_l_and_r(int16_t *__restrict__ l, int16_t *__restrict__ r, + const armral_ldpc_base_graph_t *graph, uint16_t z, uint32_t lsi, + uint16_t layer, const int16_t *__restrict__ row_min1_array, + const int16_t *__restrict__ row_min2_array, + const int16_t *__restrict__ row_sign_array, + const uint16_t *__restrict__ row_pos_array, + const int16_t *__restrict__ sign_scratch, + uint32_t *__restrict__ r_index) { + + const uint32_t *col_indices; + uint32_t i; + uint32_t j; + uint32_t r_i = *r_index; + uint32_t num_lanes = hn::Lanes(di16x8); + + i = graph->row_start_inds[layer]; + // Get the number of nonzero entries in the row + j = graph->row_start_inds[layer + 1] - i; + col_indices = graph->col_inds + i; + const uint32_t *shift_ptr = graph->shifts + i * 8 + lsi * j; + + const int16_t *sgn_scratch_buf = sign_scratch; + + // for each column i.e only non -1's + for (uint16_t col = 0; col < j; col++) { + uint32_t col_block = col_indices[col]; + + int16_t *ptr_r = &r[r_i * z]; + uint32_t shift = shift_ptr[col] % z; + + const int16_t *min1_buf = row_min1_array; + const int16_t *min2_buf = row_min2_array; + const int16_t *sgn_buf = row_sign_array; // set to 0 + const uint16_t *pos_buf = row_pos_array; + + Vec_u16x8 pos_current = hn::Set(du16x8, col); + + uint32_t blk1 = (z - shift) / num_lanes; + uint32_t blk2 = shift / num_lanes; + uint32_t tail1 = (z - shift) & (num_lanes - 1); + uint32_t tail2 = (shift) & (num_lanes - 1); + Mask_i16x8 pg_tail1 = hn::FirstN(di16x8, tail1); + Mask_i16x8 pg_tail2 = hn::FirstN(di16x8, tail2); + + // Loop over z + // shift to z-1 + int16_t *ptr_l = &l[col_block * z + shift]; // Input,point to shift3 + + for (uint32_t v_cnt = 0; v_cnt < blk1; v_cnt++) { + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + Vec_u16x8 pos = hn::LoadU(du16x8, pos_buf); + + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = hn::LoadU(di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = hn::LoadU(di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreU(merged_mins, di16x8, ptr_r); + + // update l + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreU(llrs_reg, di16x8, ptr_l); + + ptr_l += num_lanes; + ptr_r += num_lanes; + sgn_scratch_buf += num_lanes; + sgn_buf += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + pos_buf += num_lanes; + } + + if (tail1 > 0U) { + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min2_buf); + Vec_u16x8 pos = no_sanitize::MaskedLoad(hn::RebindMask(du16x8, pg_tail1), + du16x8, pos_buf); + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = + no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_buf); + merged_mins = 
hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreN(merged_mins, di16x8, ptr_r, tail1); + + // update l + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreN(llrs_reg, di16x8, ptr_l, tail1); + ptr_l += tail1; + ptr_r += tail1; + sgn_scratch_buf += tail1; + sgn_buf += tail1; + min1_buf += tail1; + min2_buf += tail1; + pos_buf += tail1; + } + + // 0 to shift-1 + ptr_l = &l[col_block * z]; // point to start + for (uint32_t v_cnt = 0; v_cnt < blk2; v_cnt++) { + + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + Vec_u16x8 pos = hn::LoadU(du16x8, pos_buf); + + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = hn::LoadU(di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = hn::LoadU(di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreU(merged_mins, di16x8, ptr_r); + + // update l + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreU(llrs_reg, di16x8, ptr_l); + + ptr_l += num_lanes; + ptr_r += num_lanes; + sgn_scratch_buf += num_lanes; + sgn_buf += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + pos_buf += num_lanes; + } + + if (tail2 > 0U) { + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min2_buf); + Vec_u16x8 pos = no_sanitize::MaskedLoad(hn::RebindMask(du16x8, pg_tail2), + du16x8, pos_buf); + + // check if this the column matches position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = + no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreN(merged_mins, di16x8, ptr_r, tail2); + + // update l + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreN(llrs_reg, di16x8, ptr_l, tail2); + + ptr_l += tail2; + ptr_r += tail2; + sgn_scratch_buf += tail2; + sgn_buf += tail2; + min1_buf += tail2; + min2_buf += tail2; + pos_buf += tail2; + } + + r_i++; + } + + // update r index for next layer + *r_index = r_i; +} + +// Variable nodes transmit their belief information to the connected check +// nodes. Decoding alogrithm implemented is scaled offset min-sum. outputs mins, +// signs and sign product to update the total belief. 
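+//
+// Illustrative scalar sketch only (the routine below is vectorized and the
+// names here are hypothetical): for each check node m and each connected
+// variable node n it effectively computes
+//
+//   int16_t l_val  = sat_sub(llr[n], r[m][n]);   // variable-to-check message
+//   sign_prod[zb] *= (l_val < 0) ? -1 : 1;       // running sign product
+//   // track min1 = smallest |l_val|, min2 = second smallest,
+//   // pos = column index where min1 occurred
+//
+// followed by the offset (2) and scale (0.75, i.e. 24576 in Q15) correction,
+// roughly:
+//
+//   min = sat_sub(min, 2); if (min < 0) { min = 0; }
+//   min = (int16_t)(((int32_t)min * 24576) >> 15);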
+HWY_FORCED_INLINE void compute_l_r_and_mins( + int16_t *__restrict__ l, int16_t *__restrict__ r, + const armral_ldpc_base_graph_t *graph, uint16_t z, uint32_t lsi, + uint16_t layer, int16_t *__restrict__ row_min1_array, + int16_t *__restrict__ row_min2_array, int16_t *__restrict__ row_sign_array, + uint16_t *__restrict__ row_pos_array, int16_t *__restrict__ sign_scratch, + uint32_t *__restrict__ r_index) { + + const uint32_t *col_indices; + uint32_t i; + uint32_t j; + uint32_t r_i = *r_index; + uint32_t t_i = 0; + uint32_t num_lanes = hn::Lanes(di16x8); + + i = graph->row_start_inds[layer]; + // Get the number of nonzero entries in the row + j = graph->row_start_inds[layer + 1] - i; + col_indices = graph->col_inds + i; + const uint32_t *shift_ptr = graph->shifts + i * 8 + lsi * j; + + int16_t *sgn_scratch_buf = sign_scratch; + + Vec_i16x8 offset8 = hn::Set(di16x8, 2); + + Vec_i16x8 plus1 = hn::Set(di16x8, 1); + Vec_i16x8 minus1 = hn::Set(di16x8, -1); + + // for each column i.e only non -1's + for (uint32_t col = 0; col < j; col++) { + uint32_t col_block = col_indices[col]; + + int16_t *ptr_r = &r[r_i * z]; + uint32_t shift = shift_ptr[col] % z; + + uint32_t blk1 = (z - shift) / num_lanes; + uint32_t blk2 = shift / num_lanes; + uint32_t tail1 = (z - shift) & (num_lanes - 1); + uint32_t tail2 = (shift) & (num_lanes - 1); + Mask_i16x8 pg_tail1 = hn::FirstN(di16x8, tail1); + Mask_i16x8 pg_tail2 = hn::FirstN(di16x8, tail2); + + int16_t *min1_buf = row_min1_array; + int16_t *min2_buf = row_min2_array; + int16_t *sgn_buf = row_sign_array; // set to 0 + uint16_t *pos_buf = row_pos_array; + + // Loop over z + // shift to z-1 + int16_t *ptr_l = &l[col_block * z + shift]; // Input,point to shift + + for (uint32_t v_cnt = 0; v_cnt < blk1; v_cnt++) { + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + Vec_i16x8 r_reg = hn::LoadU(di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absoluate + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreU(signs, di16x8, sgn_scratch_buf); + + // Sign product + Vec_i16x8 old_sgn = hn::LoadU(di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreU(sgn, di16x8, sgn_buf); + + // store updated L + hn::StoreU(vec16, di16x8, ptr_l); + + // Find min1 and min2 + Vec_i16x8 min1_old = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2_old = hn::LoadU(di16x8, min2_buf); + + Vec_i16x8 min2 = hn::Max(min1_old, hn::Min(min2_old, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // find min1 position + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = hn::LoadU(du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreU(min2, di16x8, min2_buf); + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(pos_updt, du16x8, pos_buf); + + ptr_l += num_lanes; + ptr_r += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + pos_buf += num_lanes; + sgn_scratch_buf += num_lanes; + } + + if (tail1 > 0U) { + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_l); + Vec_i16x8 r_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absolute + 
Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreN(signs, di16x8, sgn_scratch_buf, tail1); + + // Sign product + Vec_i16x8 old_sgn = no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreN(sgn, di16x8, sgn_buf, tail1); + + // store updated L + hn::StoreN(vec16, di16x8, ptr_l, tail1); + + // Find min1 and min2 + Vec_i16x8 min1_old = no_sanitize::MaskedLoad(pg_tail1, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // Find min1 pos + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = no_sanitize::MaskedLoad( + hn::RebindMask(du16x8, pg_tail1), du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreN(min2, di16x8, min2_buf, tail1); + hn::StoreN(min1, di16x8, min1_buf, tail1); + hn::StoreN(pos_updt, du16x8, pos_buf, tail1); + + ptr_l += tail1; + ptr_r += tail1; + min1_buf += tail1; + min2_buf += tail1; + sgn_buf += tail1; + pos_buf += tail1; + sgn_scratch_buf += tail1; + } + // 0 to shift-1 + ptr_l = &l[col_block * z]; // point to start + for (uint32_t v_cnt = 0; v_cnt < blk2; v_cnt++) { + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + Vec_i16x8 r_reg = hn::LoadU(di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absoluate + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreU(signs, di16x8, sgn_scratch_buf); + + // Sign product + Vec_i16x8 old_sgn = hn::LoadU(di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreU(sgn, di16x8, sgn_buf); + + // store updated L + hn::StoreU(vec16, di16x8, ptr_l); + + // Find min1 and min2 + Vec_i16x8 min1_old = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // find min1 position + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = hn::LoadU(du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreU(min2, di16x8, min2_buf); + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(pos_updt, du16x8, pos_buf); + + ptr_l += num_lanes; + ptr_r += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + pos_buf += num_lanes; + sgn_scratch_buf += num_lanes; + } + + if (tail2 > 0U) { + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_l); + Vec_i16x8 r_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absolute + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreN(signs, di16x8, sgn_scratch_buf, tail2); + + // Sign product + Vec_i16x8 old_sgn 
= no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreN(sgn, di16x8, sgn_buf, tail2); + + // store updated L + hn::StoreN(vec16, di16x8, ptr_l, tail2); + + // Find min1 and min2 + Vec_i16x8 min1_old = no_sanitize::MaskedLoad(pg_tail2, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // Find min1 pos + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = no_sanitize::MaskedLoad( + hn::RebindMask(du16x8, pg_tail2), du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreN(min2, di16x8, min2_buf, tail2); + hn::StoreN(min1, di16x8, min1_buf, tail2); + hn::StoreN(pos_updt, du16x8, pos_buf, tail2); + + ptr_l += tail2; + ptr_r += tail2; + min1_buf += tail2; + min2_buf += tail2; + sgn_buf += tail2; + pos_buf += tail2; + sgn_scratch_buf += tail2; + } + r_i++; + t_i++; + } + + *r_index = r_i - t_i; + + // offset and scale min1 and min2 + // in the same loop, adjust sign product + uint32_t blk = z / num_lanes; + uint32_t tail = z & (num_lanes - 1); + Mask_i16x8 pg_tail = hn::FirstN(di16x8, tail); + + Vec_i16x8 scale = hn::Set(di16x8, 24576); // 0.75 + int16_t *min1_buf = row_min1_array; + int16_t *min2_buf = row_min2_array; + int16_t *sgn_buf = row_sign_array; + + for (uint16_t z1 = 0; z1 < blk; z1++) { + Vec_i16x8 sgn = hn::LoadU(di16x8, sgn_buf); + sgn = hn::IfNegativeThenElse(sgn, minus1, plus1); + hn::StoreU(sgn, di16x8, sgn_buf); + + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + + // apply offset + Vec_i16x8 vec1 = hn::SaturatedSub(min1, offset8); + Vec_i16x8 vec2 = hn::SaturatedSub(min2, offset8); + + // if min1 < 0, then min1 = 0; + vec1 = hn::IfNegativeThenElse(vec1, hn::Zero(di16x8), vec1); + + // apply scale + min1 = hn::MulFixedPoint15(vec1, scale); + + // if min2 < 0, then min1 = 0; + vec2 = hn::IfNegativeThenElse(vec2, hn::Zero(di16x8), vec2); + + // apply scale + min2 = hn::MulFixedPoint15(vec2, scale); + + // store scaled, offseted min's + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(min2, di16x8, min2_buf); + + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + } + + if (tail > 0U) { + Vec_i16x8 sgn = no_sanitize::MaskedLoad(pg_tail, di16x8, sgn_buf); + sgn = hn::IfNegativeThenElse(sgn, minus1, plus1); + hn::StoreN(sgn, di16x8, sgn_buf, tail); + + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail, di16x8, min2_buf); + + // apply offset + Vec_i16x8 vec1 = hn::SaturatedSub(min1, offset8); + Vec_i16x8 vec2 = hn::SaturatedSub(min2, offset8); + + // if min1 < 0, then min1 = 0 + vec1 = hn::IfNegativeThenElse(vec1, hn::Zero(di16x8), vec1); + + // apply scale + min1 = hn::MulFixedPoint15(vec1, scale); + + vec2 = hn::IfNegativeThenElse(vec2, hn::Zero(di16x8), vec2); + + min2 = hn::MulFixedPoint15(vec2, scale); + + hn::StoreN(min1, di16x8, min1_buf, tail); + hn::StoreN(min2, di16x8, min2_buf, tail); + min1_buf += tail; + min2_buf += tail; + sgn_buf += tail; + } +} + +HWY_FORCED_INLINE bool hard_decision(int16_t *ptr_l, uint8_t *crc_buff, + uint8_t *ptr_data, uint32_t k, + uint32_t 
crc_flag) { + + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t k_prime = k + 24; + uint32_t full_vec = (k_prime) / num_lanes; + uint32_t tail_cnt = (k_prime) & (num_lanes - 1); + uint8_t *data = (uint8_t *)crc_buff; + uint32_t pad_bytes = 0; + + // if the decoded data is less than 8 bytes / not multiple of 8 bytes, prefix + // zero padding + if (crc_flag != 0U) { + if (((k_prime >> 3) % 16) != 0U) { + pad_bytes = 16 - ((k_prime >> 3) % 16); + memset(data, 0, pad_bytes); + data = data + pad_bytes; + } + } + + const Vec_i16x8 ones = + hn::Dup128VecFromValues(di16x8, 128, 64, 32, 16, 8, 4, 2, 1); + Mask_i16x8 pg_tail = hn::FirstN(di16x8, tail_cnt); + + for (uint32_t v_cnt = 0; v_cnt < full_vec; v_cnt++) { + Vec_i16x8 d = hn::LoadU(di16x8, ptr_l); + + Vec_u16x8 is_negative = + hn::BitCast(du16x8, hn::IfNegativeThenElseZero(d, ones)); + uint8_t byte1 = (uint8_t)hn::ReduceSum(du16x8, is_negative); + + *data++ = byte1; + ptr_l += num_lanes; + } + + if (tail_cnt != 0U) { + Vec_i16x8 d = no_sanitize::MaskedLoad(pg_tail, di16x8, ptr_l); + Vec_u16x8 is_negative = + hn::BitCast(du16x8, hn::IfNegativeThenElseZero(d, ones)); + uint8_t byte1 = (uint8_t)hn::ReduceSum(du16x8, is_negative); + *data++ = byte1; + ptr_l += tail_cnt; + } + + // Generate the CRC parity bits + uint64_t crc = 0; + if (crc_flag != 0U) { + armral_crc24_b_be((k_prime >> 3) + pad_bytes, (const uint64_t *)crc_buff, + &crc); + // Removing the Zero padding + if (pad_bytes != 0U) { + for (uint32_t i = 0; i < (k_prime >> 3); i++) { + ptr_data[i] = crc_buff[i + pad_bytes]; + } + } + } else { + memcpy(ptr_data, crc_buff, (k + 7) >> 3); + } + + return (crc == 0U); +} + +HWY_FORCED_INLINE void load_ptr_l(int16_t *ptr_l, const int8_t *llrs_ptr, + uint32_t len_in) { + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t full_blk = len_in / num_lanes; + uint32_t tail_cnt = len_in % num_lanes; + + for (uint32_t num_block = 0; num_block < full_blk; num_block++) { + Vec_i16x8 vec_16 = hn::PromoteTo(di16x8, hn::LoadU(di8x8, llrs_ptr)); + hn::StoreU(vec_16, di16x8, ptr_l); + ptr_l += num_lanes; + llrs_ptr += num_lanes; + } + + if (tail_cnt != 0U) { + for (uint32_t i = 0; i < tail_cnt; i++) { + ptr_l[i] = (int16_t)llrs_ptr[i]; + } + } +} + +template +bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, + uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, + Allocator &allocator) { + + bool crc_passed = false; + + // Get the base graph and the lifting size + const auto *graph = armral_ldpc_get_base_graph(bg); + uint32_t lsi = get_lifting_index(z); + + // (graph->row_start_inds[2] - graph->row_start_inds[1]) Max no of non -1 in + // columns is 19. Note: Min is calculated over 8 byte block lengths, so need + // lesser memory + uint32_t layer_size = 19 * z; + + uint32_t num_llrs = (graph->ncodeword_bits + 2) * z; + // max no of non zeros enteries in H Matrix is 316 + uint32_t r_mat_size = 316 * z; + + auto l = allocate_uninitialized(allocator, num_llrs); + // matrix R (check-to-variable-node messages) + auto r = allocate_zeroed(allocator, r_mat_size); + + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t z_len = z / num_lanes; + uint32_t offset = (z % num_lanes) ? 1 : 0; + z_len = (z_len + offset) * num_lanes; + uint32_t k = (bg == 0) ? 
(z * 22) : (z * 10); + + auto row_min1_array = allocate_uninitialized(allocator, z_len); + auto row_min2_array = allocate_uninitialized(allocator, z_len); + auto row_sign_array = allocate_zeroed(allocator, z_len); + auto row_pos_array = allocate_zeroed(allocator, z); + auto sign_scratch = allocate_uninitialized(allocator, layer_size); + auto crc_buff = allocate_zeroed(allocator, ((k + 7) >> 3) + 15); + + // NOTE: All allocations are now done! + if constexpr (Allocator::is_counting) { + return false; + } + + uint32_t r_index = 0; + + // initialization with channel LLRs. 16-bit buffer "l" will be used for + // in-place calculations + int16_t *ptr_l = l.get(); + const auto *llrs_ptr = llrs; + + // 0 memset 2z LLRs from input to fill the punctured bits + memset(ptr_l, 0, sizeof(int16_t) * 2 * z); + ptr_l = ptr_l + 2 * z; + + load_ptr_l(ptr_l, llrs_ptr, graph->ncodeword_bits * z); + + uint32_t full_blk = z_len / num_lanes; + + for (uint32_t it = 0; it < num_its; ++it) { + r_index = 0; + for (uint32_t layer = 0; layer < graph->nrows; layer++) { + + // reset the sign buffer + memset(row_sign_array.get(), 0, sizeof(int16_t) * z); + + // reset the min1 min2 buf to max + int16_t *ptr1 = row_min1_array.get(); + int16_t *ptr2 = row_min2_array.get(); + int16_t *ptr3 = row_sign_array.get(); + + for (uint32_t i = 0; i < full_blk; i++) { + Vec_i16x8 v8 = hn::Set(di16x8, 0x7FFF); + Vec_i16x8 v_sign8 = hn::Set(di16x8, 0x1); + hn::StoreU(v8, di16x8, ptr1); + hn::StoreU(v8, di16x8, ptr2); + hn::StoreU(v_sign8, di16x8, ptr3); + + ptr1 += num_lanes; + ptr2 += num_lanes; + ptr3 += num_lanes; + } + + compute_l_r_and_mins(l.get(), r.get(), graph, z, lsi, layer, + row_min1_array.get(), row_min2_array.get(), + row_sign_array.get(), row_pos_array.get(), + sign_scratch.get(), &r_index); + + update_l_and_r(l.get(), r.get(), graph, z, lsi, layer, + row_min1_array.get(), row_min2_array.get(), + row_sign_array.get(), row_pos_array.get(), + sign_scratch.get(), &r_index); + } + + // early exit if crc Passes + if (crc_idx) { + if (it < (num_its - 1)) { + crc_passed = + hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true); + if (crc_passed) { + return crc_passed; + } + } + } + } + + if (crc_idx == ARMRAL_LDPC_NO_CRC) { // do only decisions + crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0], + graph->nmessage_bits * z, false); + } else { + crc_passed = + hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true); + } + return crc_passed; +} + +} // namespace armral::ldpc + +template bool armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, heap_allocator &); + +template bool armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &); + +armral_status armral_ldpc_decode_block(const int8_t *llrs, + armral_ldpc_graph_t bg, uint32_t z, + uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out) { + + heap_allocator allocator{}; + bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, + data_out, allocator); + return (result) ? 
ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; +} + +armral_status +armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg, + uint32_t z, uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out, void *buffer) { + + buffer_bump_allocator allocator{buffer}; + bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, + data_out, allocator); + return (result) ? ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; +} + +uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, + uint32_t z, + uint32_t crc_idx, + uint32_t num_its) { + counting_allocator allocator{}; + armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr, + allocator); + return allocator.required_bytes(); +} diff --git a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp b/src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp similarity index 100% rename from src/UpperPHY/LDPC/highway/ldpc_encoder.cpp rename to src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp diff --git a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp deleted file mode 100644 index ea6d98441e944d258b085b7947c4085a201fedc4..0000000000000000000000000000000000000000 --- a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp +++ /dev/null @@ -1,967 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its - affiliates - SPDX-License-Identifier: BSD-3-Clause -*/ - -#include "../ldpc_coding.hpp" -#include "utils/allocators.hpp" -#include "utils/bits_to_bytes.hpp" - -#include "utils/hwy_types.hpp" -namespace hn = hwy::HWY_NAMESPACE; - -#include -#include -#include - -namespace { -using Mask_i16 = hn::Mask; - -inline int16_t __attribute__((always_inline)) sat_abs_16(int16_t a) { - int16_t partial_res = abs((int32_t)a); - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -inline int16_t __attribute__((always_inline)) sat_add_16(int16_t a, int16_t b) { - int16_t partial_res = (uint32_t)a + (uint32_t)b; - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -inline int16_t __attribute__((always_inline)) sat_sub_16(int16_t a, int16_t b) { - int16_t partial_res = (int32_t)a - (int32_t)b; - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -struct ldpc_layer_data { - uint32_t z; - uint32_t lsi; - uint32_t row; - uint32_t row_start_ind; - const armral_ldpc_base_graph_t *graph; - uint32_t num_cols; - const uint32_t *shift_ptr; - const uint32_t *col_ptr; - - ldpc_layer_data(uint32_t z_in, uint32_t lsi_in, - const armral_ldpc_base_graph_t *graph_in) - : z(z_in), lsi(lsi_in), row(0), row_start_ind(0), graph(graph_in), - num_cols(graph->row_start_inds[1]), - shift_ptr(graph->shifts + lsi * num_cols), col_ptr(graph->col_inds) {} - - void next() { - row++; - row_start_ind = graph->row_start_inds[row]; - col_ptr += num_cols; - num_cols = graph->row_start_inds[row + 1] - row_start_ind; - shift_ptr = graph->shifts + row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - } -}; - -template -inline T max(T a, T b) { - return a > b ? a : b; -} - -template -inline T min(T a, T b) { - return a < b ? 
a : b; -} - -enum lifting_size_category { CAT_TINY, CAT_TAIL, CAT_LARGE }; - -template -class crc_checker { -public: - crc_checker(uint32_t z, uint32_t crc_idx, Allocator &allocator) : m_z(z) { - // Calculate K', which is the number of info bits + CRC bits (i.e. the - // non-filler bits of the code block) - m_k_prime = crc_idx + 24; - - // The CRC calculation routine expects a particular size of input (n % 16 = 0 - // where n is the number of bytes), which requires padding the input to the - // required size - m_buffer_size = (m_k_prime + 7) / 8; - m_total_bits = m_k_prime; - if (m_k_prime % 128 != 0) { - m_num_pad_bits = 128 - (m_k_prime % 128); - m_total_bits = m_k_prime + m_num_pad_bits; - m_buffer_size = m_total_bits >> 3; - } - - m_llrs = allocate_uninitialized(allocator, m_total_bits + m_z - 1); - m_buffer = allocate_uninitialized(allocator, m_buffer_size); - } - - bool check(const int16_t *new_llrs) { - // Copy the LLRs corresponding to the bits we need to do the CRC check after - // the padding bits - memset(m_llrs.get(), 0, m_num_pad_bits * sizeof(int16_t)); - for (uint32_t num_block = 0; num_block < ((m_k_prime + m_z - 1) / m_z); - num_block++) { - memcpy(m_llrs.get() + m_num_pad_bits + (num_block * m_z), - new_llrs + (2 * num_block * m_z), m_z * sizeof(int16_t)); - } - - // Hard decode - armral::llrs_to_bits(m_total_bits, m_llrs.get(), m_buffer.get()); - - // Generate the CRC parity bits - uint64_t crc; - armral_crc24_b_be(m_buffer_size, (const uint64_t *)m_buffer.get(), &crc); - - // If the CRC is zero then the code block has been correctly decoded and we - // can terminate the iterations early - return (crc == 0); - } - -private: - uint32_t m_z{0}; - uint32_t m_k_prime{0}; - uint32_t m_buffer_size{0}; - uint32_t m_num_pad_bits{0}; - uint32_t m_total_bits{0}; - unique_ptr m_llrs; - unique_ptr m_buffer; -}; - -template -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, int16_t *check); - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - // Loop through the rows in the base graph - bool passed = true; - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - // Loop through the rows in the block - for (uint32_t zb = 0; zb < z && passed; ++zb) { - // Loop through the columns in the row - int16_t scal_check = 0; - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] + zb) % z; - auto codeword_ind = col_ptr[col] * z + shift; - scal_check ^= llrs[codeword_ind]; - } - passed &= scal_check >= 0; - } - } - return passed; -} - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - // Loop through the rows in the base graph - bool passed = true; - Mask_i16 pg_tail = hn::FirstN(di16, (size_t)tail_size); - - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - 
row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - memset(check, 0, z * sizeof(int16_t)); - - // Loop through the columns - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] % z); - auto codeword_ind = col_ptr[col] * (2 * z) + shift; - - // No need to loop here, as there is only a tail - const int16_t *llrs_ptr = llrs + codeword_ind; - int16_t *check_ptr = check; - - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreN(result_reg, di16, check_ptr, tail_size); - } - for (uint32_t zb = 0; zb < z && passed; ++zb) { - passed &= check[zb] >= 0; - } - } - return passed; -} - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the rows in the base graph - bool passed = true; - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - memset(check, 0, z * sizeof(int16_t)); - - // Loop through the columns - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] % z); - auto codeword_ind = col_ptr[col] * (2 * z) + shift; - // Loop through the rows in the block - - // The check can be done on the LLRs instead of on the bit values, as - // there is a one-to-one transform between LLRs and bit. Negative LLRs - // represent a hard decision for the bit to be one, and non-negative - // values represent a zero. Hence the check needs to xor all LLRs - // and then assert that the result is non-negative. 
- const int16_t *llrs_ptr = llrs + codeword_ind; - int16_t *check_ptr = check; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 check_reg = hn::LoadU(di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreU(result_reg, di16, check_ptr); - - // Increment pointers - llrs_ptr += num_lanes; - check_ptr += num_lanes; - } - // Process tail - if (tail_size != 0) { - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreN(result_reg, di16, check_ptr, tail_size); - } - } - for (uint32_t zb = 0; zb < z && passed; ++zb) { - passed &= check[zb] >= 0; - } - } - return passed; -} - -// For each check node m in the layer, compute: -// - the variable-to-check-node messages L(n,m) for each variable node n in -// \psi(m), where \psi(m) is the set of variable nodes connected to m: -// L(n,m) = LLR(n) - R(n,m) -// - the products \prod_{n' \in \psi(m)} L(n',m) (they will be used to compute -// sign(R(n,m)) in a second step) -// - \min_{n \in \psi(m)} |L(n,m)| and the second minimum (they will be used to -// compute |R(n,m)| in a second step) -template -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array); - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array) { - const auto *r_ptr = r; - // Loop through the Z rows in the layer (check node m) - for (uint32_t zb = 0; zb < d->z; ++zb) { - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - auto shift = (d->shift_ptr[0] + zb) % d->z; - int16_t l_val = sat_sub_16(llrs[d->col_ptr[0] * d->z + shift], *(r_ptr++)); - - int16_t row_sign = l_val; - - int16_t row_min = sat_abs_16(l_val); - - *(l++) = l_val; - - // Column 1 - shift = (d->shift_ptr[1] + zb) % d->z; - l_val = sat_sub_16(llrs[d->col_ptr[1] * d->z + shift], *(r_ptr++)); - - row_sign ^= l_val; - - int16_t abs_val = sat_abs_16(l_val); - int16_t row_min2 = max(row_min, abs_val); - row_min = min(row_min, abs_val); - - *(l++) = l_val; - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - // Compute L(n,m) = LLR(n) - R(n,m) - shift = (d->shift_ptr[col] + zb) % d->z; - l_val = sat_sub_16(llrs[d->col_ptr[col] * d->z + shift], *(r_ptr++)); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - row_sign ^= l_val; - - // Compute the min(|L(n,m)|) and the second minimum - abs_val = sat_abs_16(l_val); - row_min2 = max(row_min, min(row_min2, abs_val)); - row_min = min(row_min, abs_val); - - // Store L(n,m) - *(l++) = l_val; - } - - // Store the two minima and the product for Z rows - row_min_array[zb] = row_min; - row_min2_array[zb] = row_min2; - row_sign_array[zb] = row_sign; - } -} - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t 
*row_min2_array, - int16_t *row_sign_array) { - // Case for lifting sizes Z such as 8 <= Z < 16 - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - int16_t *l_ptr = l; - auto shift = d->shift_ptr[0] % d->z; - const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; - const int16_t *r_ptr = r; - - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 row_sign = l_reg; - - Vec_i16 row_min = hn::SaturatedAbs(l_reg); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - - // Column 1 - l_ptr = l + d->z; - shift = d->shift_ptr[1] % d->z; - llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; - r_ptr = r + d->z; - - r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - row_sign = hn::Xor(row_sign, l_reg); - - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Vec_i16 row_min2 = hn::Max(row_min, abs_reg); - row_min = hn::Min(row_min, abs_reg); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - l_ptr = l + d->z * col; - shift = d->shift_ptr[col] % d->z; - llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; - r_ptr = r + d->z * col; - - // Compute L(n,m) = LLR(n) - R(n,m) - r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - row_sign = hn::Xor(row_sign, l_reg); - - // Compute the min(|L(n,m)|) and the second minimum - abs_reg = hn::SaturatedAbs(l_reg); - row_min2 = hn::Max(row_min, hn::Min(row_min2, abs_reg)); - row_min = hn::Min(row_min, abs_reg); - - // Store L(n,m) - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Store the two minima and the product for Z rows - hn::StoreN(row_min, di16, row_min_array, tail_size); - hn::StoreN(row_min2, di16, row_min2_array, tail_size); - hn::StoreN(row_sign, di16, row_sign_array, tail_size); -} - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - int16_t *l_ptr = l; - auto shift = d->shift_ptr[0] % d->z; - const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; - const int16_t *r_ptr = r; - int16_t *sign_ptr = row_sign_array; - int16_t *min_ptr = row_min_array; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - hn::StoreU(l_reg, di16, sign_ptr); - - hn::StoreU(hn::SaturatedAbs(l_reg), di16, min_ptr); - - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, 
llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - hn::StoreN(l_reg, di16, sign_ptr, tail_size); - - hn::StoreN(hn::SaturatedAbs(l_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Column 1 - shift = d->shift_ptr[1] % d->z; - l_ptr = l + d->z; - llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; - r_ptr = r + d->z; - sign_ptr = row_sign_array; - min_ptr = row_min_array; - int16_t *min2_ptr = row_min2_array; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); - - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreU(hn::Max(min_reg, abs_reg), di16, min2_ptr); - hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); - - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); - - Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreN(hn::Max(min_reg, abs_reg), di16, min2_ptr, tail_size); - hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - l_ptr = l + d->z * col; - shift = d->shift_ptr[col] % d->z; - llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; - r_ptr = r + d->z * col; - sign_ptr = row_sign_array; - min_ptr = row_min_array; - min2_ptr = row_min2_array; - - // Loop through the Z rows in the layer (check node m) - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - // Compute L(n,m) = LLR(n) - R(n,m) - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); - - // Compute the min(|L(n,m)|) and the second minimum - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreU(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr); - hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); - - // Store L(n,m) - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - // Process tail - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); - - Vec_i16 min_reg = 
no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreN(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr, - tail_size); - hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - } -} - -// For each check node m in the layer, compute: -// - The check-to-variable-node messages R(n,m) for each n in \psi(m), where -// \psi(m) is the set of variable nodes connected to check node m: -// sign(R(n,m)) = \prod_{n' \in \psi(m)/n} sign(L(n',m)) = -// = \prod_{n' \in \psi(m)} sign(L(n',m)) / sign(L(n,m)) -// |R(n,m)| = \min_{n' \in \psi(m)/n} |L(n',m)| = -// = the first minimum when n' != n, the second minimum otherwise -// - The log likelihood ratios for each n in \psi(m): -// LLR(n) = R(n,m) + L(n,m) -template -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array); - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - // Loop through the Z rows in the layer (check node m) - for (uint32_t zb = 0; zb < d->z; ++zb) { - const int16_t *l_ptr = l + zb * d->num_cols; - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - int16_t col_sign = (row_sign_array[zb] ^ l_ptr[col]) < 0 ? -1 : 1; - - // Compute R(n,m) - int16_t abs_val = sat_abs_16(l_ptr[col]); - int16_t r_val = - col_sign * (abs_val == row_min_array[zb] ? 
row_min2_array[zb] - : row_min_array[zb]); - - // Compute LLR(n) = R(n,m) + L(n,m) - auto shift = (d->shift_ptr[col] + zb) % d->z; - auto col_ind = d->col_ptr[col] * d->z + shift; - llrs[col_ind] = sat_add_16(r_val, l_ptr[col]); - - // Store R(n,m) for the next iteration - r[col] = r_val; - } - } -} - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - // Case for lifting sizes 4 <= Z < 8 (rows in the layer) - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - Vec_i16 row_min = no_sanitize::MaskedLoad(pg_tail, di16, row_min_array); - Vec_i16 row_min2 = no_sanitize::MaskedLoad(pg_tail, di16, row_min2_array); - Vec_i16 row_sign = no_sanitize::MaskedLoad(pg_tail, di16, row_sign_array); - - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - auto shift = d->shift_ptr[col] % d->z; - auto col_ind = d->col_ptr[col] * (2 * d->z); - int16_t *r_ptr = r + d->z * col; - const int16_t *l_ptr = l + d->z * col; - int16_t *llrs_ptr = llrs + col_ind + shift; - - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Vec_i16 eor_reg = hn::Xor(row_sign, l_reg); - Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Mask_i16 pg_tail_eq = hn::Eq(abs_reg, row_min); - Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, row_min2, row_min); - Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreN(result, di16, llrs_ptr, tail_size); - - // Store R(n,m) for the next iteration - hn::StoreN(r_reg, di16, r_ptr, tail_size); - - // Rearrange LLRs - memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); - // copy (z - shift) elts in the main block to the replicated block - memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, - (d->z - shift) * sizeof(int16_t)); - } -} - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - auto shift = d->shift_ptr[col] % d->z; - auto col_ind = d->col_ptr[col] * (2 * d->z); - int16_t *llrs_ptr = llrs + col_ind + shift; - const int16_t *l_ptr = l + d->z * col; - int16_t *r_ptr = r + d->z * col; - const int16_t *sign_ptr = row_sign_array; - const int16_t *min_ptr = row_min_array; - const int16_t *min2_ptr = row_min2_array; - - // Loop through the Z rows in the layer (check node m) - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = hn::LoadU(di16, l_ptr); - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); - Mask_i16 pg_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); - 
Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Mask_i16 pg_eq = hn::Eq(abs_reg, min_reg); - Vec_i16 tmp_reg = hn::IfThenElse(pg_eq, min2_reg, min_reg); - Vec_i16 r_reg = hn::IfThenElse(pg_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreU(result, di16, llrs_ptr); - - // Store R(n,m) for the next iteration - hn::StoreU(r_reg, di16, r_ptr); - - // Increment pointers - l_ptr += num_lanes; - r_ptr += num_lanes; - llrs_ptr += num_lanes; - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - } - - if (tail_size != 0) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); - Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Mask_i16 pg_tail_eq = hn::Eq(abs_reg, min_reg); - Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, min2_reg, min_reg); - Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreN(result, di16, llrs_ptr, tail_size); - - // Store R(n,m) for the next iteration - hn::StoreN(r_reg, di16, r_ptr, tail_size); - } - - // Rearrange LLRs - // copy shifted elements in the replicated block - // back to the beginning of the main block - memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); - // copy (z - shift) elts in the main block to the replicated block - memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, - (d->z - shift) * sizeof(int16_t)); - } -} - -template -void __attribute__((flatten)) -run_iterations(uint32_t num_its, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, int16_t *r, int16_t *l, - int16_t *new_llrs, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, - int16_t *row_min2_array, int16_t *row_sign_array, int16_t *check, - bool check_convergence, - std::optional> &crc_checker) { - for (uint32_t i = 0; i < num_its; ++i) { - ldpc_layer_data d(z, lsi, graph); - auto *r_ptr = r; - - // Loop through the layers (groups of Z rows) - compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, - full_vec, tail_size, row_min_array, - row_min2_array, row_sign_array); - compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, - tail_size, row_min_array, row_min2_array, - row_sign_array); - - for (uint32_t row = 1; row < graph->nrows; ++row) { - d.next(); - r_ptr = r + d.row_start_ind * z; - - // Variable-to-check node messages update - compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, - full_vec, tail_size, row_min_array, - row_min2_array, row_sign_array); - // LLRs update - compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, - tail_size, row_min_array, row_min2_array, - row_sign_array); - } - - // CRC check and early termination - bool crc_passed = crc_checker.has_value() && crc_checker->check(new_llrs); - if (check_convergence && - (crc_passed || parity_check(new_llrs, z, lsi, graph, num_lanes, - full_vec, tail_size, check))) { - break; - } - } -} - -} // anonymous namespace - -template -void armral::ldpc::decode_block(const int8_t 
+                                uint32_t z, uint32_t crc_idx, uint32_t num_its,
+                                uint8_t *data_out, Allocator &allocator) {
+  // Get the base graph and the lifting size
+  const auto *graph = armral_ldpc_get_base_graph(bg);
+  uint32_t lsi = get_lifting_index(z);
+
+  // Only allocate the CRC checker if necessary.
+  std::optional> maybe_crc_checker;
+  if (crc_idx != ARMRAL_LDPC_NO_CRC) {
+    maybe_crc_checker = crc_checker{z, crc_idx, allocator};
+  }
+
+  const uint32_t num_llrs = (graph->ncodeword_bits + 2) * z;
+
+  // Assign memory for the things that we need.
+  // We know that the first block rows have the largest number of non-zero
+  // entries, so the largest layer will be for the first block rows. In
+  // particular, for both base graphs, the second row is of longest length.
+  uint32_t mat_size = graph->row_start_inds[graph->nrows] * z;
+  uint32_t layer_size =
+      (graph->row_start_inds[2] - graph->row_start_inds[1]) * z;
+  // We need to keep a record of matrix L (variable-to-check-node messages)
+  auto l = allocate_uninitialized(allocator, layer_size);
+  // We need to keep a record of matrix R (check-to-variable-node messages)
+  auto r = allocate_zeroed(allocator, mat_size);
+
+  auto row_min_array = allocate_zeroed(allocator, z);
+  auto row_min2_array = allocate_zeroed(allocator, z);
+  auto row_sign_array = allocate_zeroed(allocator, z);
+
+  auto check = allocate_zeroed(allocator, z);
+
+  // Scalar CAT_TINY tails are less efficient than processing them as a
+  // single, partial vector instruction, so this path is currently disabled.
+  bool z_is_tiny = false;
+
+  // Keep a record of the current and previous values of the LLRs.
+  // Copy the input LLRs.
+  const auto *llrs_ptr = llrs;
+  size_t new_llrs_size = num_llrs;
+  std::optional> maybe_out_llrs;
+  if (!z_is_tiny) {
+    // Double the storage required to replicate LLRs for optimization
+    new_llrs_size *= 2;
+    // Extra buffer to pack the LLRs again
+    maybe_out_llrs = allocate_uninitialized(allocator, num_llrs);
+  }
+  auto new_llrs = allocate_uninitialized(allocator, new_llrs_size);
+
+  // NOTE: All allocations are now done!
+  if constexpr (Allocator::is_counting) {
+    return;
+  }
+
+  if (z_is_tiny) {
+    // Set the value of the current LLRs from the ones passed in.
+    // We need to take account of the punctured columns.
+    // Also widen to int16_t for use in intermediate calculations.
+    memset(new_llrs.get(), 0, 2 * z * sizeof(int16_t));
+    for (uint32_t i = 0; i < z * graph->ncodeword_bits; i++) {
+      new_llrs[2 * z + i] = (int16_t)llrs[i];
+    }
+  } else {
+    // Each block of Z elements is replicated: b1|b1|b2|b2|...
+    // We need to take account of the punctured columns.
+    // Also widen to int16_t for use in intermediate calculations.
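+    // The two punctured columns account for the first 2 * z LLRs; with every
+    // block stored twice they become the 4 * z zeroed entries below. Each
+    // received block b is then written twice (b|b) so that reads of a
+    // cyclically shifted block never wrap around the block boundary.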
+    memset(new_llrs.get(), 0, 4 * z * sizeof(int16_t));
+    auto *new_llrs_ptr = &new_llrs[4 * z];
+    for (uint32_t num_block = 0; num_block < graph->ncodeword_bits;
+         num_block++) {
+      for (uint32_t i = 0; i < z; i++) {
+        new_llrs_ptr[i] = (int16_t)llrs_ptr[i];
+        new_llrs_ptr[z + i] = (int16_t)llrs_ptr[i];
+      }
+      new_llrs_ptr += 2 * z;
+      llrs_ptr += z;
+    }
+  }
+
+  // Precompute the number of full vectors and the tail size
+  int32_t num_lanes = hn::Lanes(di16);
+  int32_t full_vec = z / num_lanes;
+  uint32_t tail_size = z % num_lanes;
+  bool is_tail_only = (tail_size == z && !z_is_tiny);
+
+  if (z_is_tiny) {
+    run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                   new_llrs.get(), num_lanes, full_vec, tail_size,
+                   row_min_array.get(), row_min2_array.get(),
+                   row_sign_array.get(), check.get(),
+                   check_convergence, maybe_crc_checker);
+
+    // Hard decode into the output variable
+    llrs_to_bits(num_llrs, new_llrs.get(), data_out);
+  } else {
+    if (is_tail_only) {
+      run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                     new_llrs.get(), num_lanes, full_vec, tail_size,
+                     row_min_array.get(), row_min2_array.get(),
+                     row_sign_array.get(), check.get(),
+                     check_convergence, maybe_crc_checker);
+    } else {
+      run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                     new_llrs.get(), num_lanes, full_vec, tail_size,
+                     row_min_array.get(), row_min2_array.get(),
+                     row_sign_array.get(), check.get(),
+                     check_convergence, maybe_crc_checker);
+    }
+    // Pack LLRs, copy back to original storage
+    auto *out_llrs = maybe_out_llrs.value().get();
+    for (uint32_t num_block = 0; num_block < graph->ncodeword_bits + 2;
+         num_block++) {
+      memcpy(out_llrs + num_block * z, &new_llrs[2 * num_block * z],
+             z * sizeof(int16_t));
+    }
+
+    // Hard decode into the output variable
+    llrs_to_bits(num_llrs, out_llrs, data_out);
+  }
+}
+
+template void armral::ldpc::decode_block(
+    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
+    uint32_t num_its, uint8_t *data_out, heap_allocator &);
+
+template void armral::ldpc::decode_block(
+    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
+    uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &);
+
+armral_status armral_ldpc_decode_block(const int8_t *llrs,
+                                       armral_ldpc_graph_t bg, uint32_t z,
+                                       uint32_t crc_idx, uint32_t num_its,
+                                       uint8_t *data_out) {
+  heap_allocator allocator{};
+  armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out,
+                             allocator);
+  return ARMRAL_SUCCESS;
+}
+
+armral_status
+armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg,
+                                 uint32_t z, uint32_t crc_idx, uint32_t num_its,
+                                 uint8_t *data_out, void *buffer) {
+  buffer_bump_allocator allocator{buffer};
+  armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out,
+                             allocator);
+  return ARMRAL_SUCCESS;
+}
+
+uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
+                                                      uint32_t z,
+                                                      uint32_t crc_idx,
+                                                      uint32_t num_its) {
+  counting_allocator allocator{};
+  armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr,
+                             allocator);
+  return allocator.required_bytes();
+}
diff --git a/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
index e7a0a3aa0e4eb1eb4e3bbdf761977f6894373725..b136b4c155f2afc5d4331eeb474d6e14771e0b95 100644
--- a/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
+++ b/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
@@ -414,7 +414,7 @@ inline void combine_seq_out<4, 8>(const uint8_t *seq1, const uint8_t *seq2,
 
   Vec_u16x8 in1 = hn::LoadU(du16x8, (const uint16_t *)seq1);
   Vec_u16x8 in2 = hn::LoadU(du16x8, (const uint16_t *)seq2);
-  Vec_u8x16 h = hn::Combine(du8x16, hn::Set(du8x8, 0), hn::Load(du8x8, hist2));
+  Vec_u8x16 h = hn::Combine(du8x16, hn::Set(du8x8, 0), hn::LoadU(du8x8, hist2));
   h = hn::InterleaveWholeLower(du8x16, h, h);
   Vec_u8x16 h_ofs0 = hn::Dup128VecFromValues(du8x16, 0, 1, 0, 1, 0, 1, 0, 1, 0,
                                              1, 0, 1, 0, 1, 0, 1);
@@ -571,4 +571,4 @@ inline void combine_hist<1>(const uint8_t * /*hist1*/,
   // nothing to do if L=1, only one choice of history.
 }
 
-} // namespace
\ No newline at end of file
+} // namespace
diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp
index 40eb8950c0181834798bed0df7dd4024fdd6e985..3fbfc23b7867b5be9c3521dc767223eff3945f2c 100644
--- a/src/utils/highway/bits_to_bytes.hpp
+++ b/src/utils/highway/bits_to_bytes.hpp
@@ -42,7 +42,8 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in,
     // Generate an index to select this byte pair
     Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind));
     Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices);
-    // Shift the bits we want to convert into the rightmost position and mask out the higher bits
+    // Shift the bits we want to convert into the rightmost position and mask
+    // out the higher bits
     Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts));
     hn::StoreU(spread_bits, du8x16, out);
     out += hn::Lanes(du8x16);
@@ -56,12 +57,14 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in,
   // Load partial vector of remaining bytes
   Vec_u8x16 bytes = no_sanitize::MaskedLoad(load_mask, du8x16, in);
 
-  // We can process two bytes at once, stopping once we reach the end of the data
+  // We can process two bytes at once, stopping once we reach the end of the
+  // data
   for (size_t byte_ind = 0; byte_ind < remaining_bytes; byte_ind += 2) {
     // Generate an index to select this byte pair
     Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind));
     Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices);
-    // Shift the bits we want to convert into the rightmost position and mask out the higher bits
+    // Shift the bits we want to convert into the rightmost position and mask
+    // out the higher bits
     Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts));
     bool store_remainder = (byte_ind + 2) >= remaining_bytes;
     hn::StoreN(spread_bits, du8x16, out,
diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp
index 9c9a022e3a6510078b0a5b18b324ba80717723a4..4dc43710b11c670318cf34f5a6ec3d0b4c8ad012 100644
--- a/src/utils/hwy_types.hpp
+++ b/src/utils/hwy_types.hpp
@@ -56,6 +56,8 @@ using Vec_f64x2 = hn::Vec;
 using Mask_u8x16 = hn::Mask;
 using Mask_i8x16 = hn::Mask;
 using Mask_u32x4 = hn::Mask;
+using Mask_i16x8 = hn::Mask;
+using Mask_u16x8 = hn::Mask;
 
 /*
 Full64 Vector Types.
@@ -194,7 +196,7 @@ https://google.github.io/highway/en/master/faq.html#correctness
 However their solution is to make use of less optimal code which makes it
 unfeasible. */
 
-#if HWY_IS_ASAN && HWY_TARGET == HWY_NEON
+#if HWY_IS_ASAN && HWY_TARGET & (HWY_ALL_NEON | HWY_SSE4 | HWY_AVX2)
 namespace no_sanitize {
 
 #define NO_ASAN __attribute__((no_sanitize("address")))
@@ -207,8 +209,9 @@ NoASANMaskedLoad(M m, D d, const hn::TFromD *HWY_RESTRICT unaligned) {
 template
 hn::VFromD MaskedLoad(M m, D d,
                       const hn::TFromD *HWY_RESTRICT unaligned) {
-  // Dereference the pointer to allow the AdressSantizer to add acheck that it is valid.
-  // Test the first data lane as well as the last active lane, derived from the mask provided
+  // Dereference the pointer to allow the AddressSanitizer to add a check that
+  // it is valid. Test the first data lane as well as the last active lane,
+  // derived from the mask provided.
   volatile auto test_first_lane = *unaligned;
   volatile auto test_last_active_lane = *(unaligned + FindLastTrue(d, m));
   (void)test_first_lane;