diff --git a/CREDITS.md b/CREDITS.md
index 9d655731b27cd29a3715e75b58f09d69839183ff..b5287fd24ecea884fb8108f90e207b1b390cf76c 100644
--- a/CREDITS.md
+++ b/CREDITS.md
@@ -2,6 +2,21 @@
 In addition to the primary development being done by Arm, the following people
 and organizations have contributed to Arm RAN Acceleration Library:
 
+- The following Google Highway implementation has been updated:
+  `highway/arm_ldpc_decoder.cpp`,
+  contributed by Cambridge Consultants. See
+  .
+
+- The following Google Highway implementations:
+  `highway/arm_turbo_decoder_batch.hpp`,
+  `highway/arm_turbo_decoder_single.hpp`,
+  `highway/arm_polar_decoder.cpp`,
+  `highway/arm_polar_encoder.cpp`,
+  `highway/arm_polar_frozen_bits.cpp`,
+  `highway/fft_*`
+  have been contributed by Cambridge Consultants. See
+  .
+
 - The following Google Highway implementations:
   `highway/arm_scrambling.cpp`,
   `highway/arm_mat_seq_generator.cpp`,
diff --git a/armral_hwy.cmake b/armral_hwy.cmake
index 58221bb6db9a86d017a6777fb75b40d4c7b5be26..e7f92a530ce8b1d6516d47e79d9d9a33d5280b8e 100644
--- a/armral_hwy.cmake
+++ b/armral_hwy.cmake
@@ -88,6 +88,16 @@ set_property(
   PROPERTY COMPILE_DEFINITIONS
            HWY_DISABLED_TARGETS=HWY_SVE2_128|HWY_SVE2|HWY_SVE_256|HWY_SVE)
 
+# The LDPC decoder requires the [S]VQRDMULH instruction, which is only
+# available under NEON and SVE2 on AArch64. We have therefore disabled SVE for
+# all Arm platforms when VQRDMULH is required, to avoid falling back to
+# (slower) generic implementations.
+set_property(
+  SOURCE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE_256|HWY_SVE)
+
 # The Turbo decoder implementation does not support scalable vectors and is
 # memory access heavy. The overhead of implicit masking when using fixed 128-bit
 # vectors causes a ~60% overhead. Benchmarks include the batch and single header
@@ -193,8 +203,8 @@ set(ARMRAL_LIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_decoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/highway/arm_convolutional_encoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/highway/arm_demodulation.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/highway/arm_modulation.cpp
diff --git a/include/armral.h b/include/armral.h
index 434f980951fcac57d74d58467c385779bc3e3910..5c9810af6e8084a0c3d8ba5310c15c7eae04cb73 100644
--- a/include/armral.h
+++ b/include/armral.h
@@ -95,8 +95,8 @@
 #pragma GCC diagnostic pop
 #endif
 #else
-// GCC sometimes complains about declaration shadowing members in arm_neon-inl.h.
-// nothing we can do about that, so ignore it!
+// GCC sometimes complains about declaration shadowing members in
+// arm_neon-inl.h. nothing we can do about that, so ignore it!
#ifndef __clang__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp index 284cbc2ada90147ffaa6ee34c054db43cc963dc1..a5c8f9a961e5af35f40e84f80f479f4e3241150d 100644 --- a/src/UpperPHY/CRC/highway/crc_common.hpp +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -11,8 +11,8 @@ namespace hn = hwy::HWY_NAMESPACE; -// Allow compilation on non-arm architectures by aliasing poly64_t to an existing type -// Test if arm_neon.h has been included +// Allow compilation on non-arm architectures by aliasing poly64_t to an +// existing type Test if arm_neon.h has been included #ifndef _AARCH64_NEON_H_ using poly64_t = uint64_t; #endif @@ -23,7 +23,8 @@ using poly64_t = uint64_t; // vaddq_p64 = Xor // vrev64q_u8 = Reverse8 // vld1q_p64 = LoadU -// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced pointer +// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced +// pointer template static inline Vec_u64x2 load_p64x2(const uint64_t *p_in) { diff --git a/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07ec726b5fc4ad09d91af42e2672cd6a7f63bc79 --- /dev/null +++ b/src/UpperPHY/LDPC/highway/arm_ldpc_decoder.cpp @@ -0,0 +1,812 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ + +#include "../ldpc_coding.hpp" +#include "armral.h" +#include "utils/allocators.hpp" +#include "utils/bits_to_bytes.hpp" + +#include +#include + +#include +#include +#include + +namespace hn = hwy::HWY_NAMESPACE; + +namespace armral::ldpc { + +// Check nodes process the received information, update it, and send it back to +// the connected variable nodes. +// l is updated belief. +// r is extrinsic information. +// min values, signs and sign products are passed as input argument to update +// the belief and store the extrinsic information. 
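+//
+// As a rough, illustrative scalar sketch (not the vectorized path below, and
+// using hypothetical names that mirror the scratch buffers), the per-element
+// update amounts to:
+//
+//   int16_t mag   = (min1_pos[zb] == col) ? min2[zb] : min1[zb]; // exclude own contribution
+//   int16_t r_new = sign[zb] * sign_prod[zb] * mag;              // re-apply signs
+//   r[m][n]       = r_new;                                       // extrinsic message
+//   l[n]          = sat_add(l[n], r_new);                        // updated belief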
+HWY_FORCED_INLINE void +update_l_and_r(int16_t *__restrict__ l, int16_t *__restrict__ r, + const armral_ldpc_base_graph_t *graph, uint16_t z, uint32_t lsi, + uint16_t layer, const int16_t *__restrict__ row_min1_array, + const int16_t *__restrict__ row_min2_array, + const int16_t *__restrict__ row_sign_array, + const uint16_t *__restrict__ row_pos_array, + const int16_t *__restrict__ sign_scratch, + uint32_t *__restrict__ r_index) { + + const uint32_t *col_indices; + uint32_t i; + uint32_t j; + uint32_t r_i = *r_index; + uint32_t num_lanes = hn::Lanes(di16x8); + + i = graph->row_start_inds[layer]; + // Get the number of nonzero entries in the row + j = graph->row_start_inds[layer + 1] - i; + col_indices = graph->col_inds + i; + const uint32_t *shift_ptr = graph->shifts + i * 8 + lsi * j; + + const int16_t *sgn_scratch_buf = sign_scratch; + + // for each column i.e only non -1's + for (uint16_t col = 0; col < j; col++) { + uint32_t col_block = col_indices[col]; + + int16_t *ptr_r = &r[r_i * z]; + uint32_t shift = shift_ptr[col] % z; + + const int16_t *min1_buf = row_min1_array; + const int16_t *min2_buf = row_min2_array; + const int16_t *sgn_buf = row_sign_array; // set to 0 + const uint16_t *pos_buf = row_pos_array; + + Vec_u16x8 pos_current = hn::Set(du16x8, col); + + uint32_t blk1 = (z - shift) / num_lanes; + uint32_t blk2 = shift / num_lanes; + uint32_t tail1 = (z - shift) & (num_lanes - 1); + uint32_t tail2 = (shift) & (num_lanes - 1); + Mask_i16x8 pg_tail1 = hn::FirstN(di16x8, tail1); + Mask_i16x8 pg_tail2 = hn::FirstN(di16x8, tail2); + + // Loop over z + // shift to z-1 + int16_t *ptr_l = &l[col_block * z + shift]; // Input,point to shift3 + + for (uint32_t v_cnt = 0; v_cnt < blk1; v_cnt++) { + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + Vec_u16x8 pos = hn::LoadU(du16x8, pos_buf); + + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = hn::LoadU(di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = hn::LoadU(di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreU(merged_mins, di16x8, ptr_r); + + // update l + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreU(llrs_reg, di16x8, ptr_l); + + ptr_l += num_lanes; + ptr_r += num_lanes; + sgn_scratch_buf += num_lanes; + sgn_buf += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + pos_buf += num_lanes; + } + + if (tail1 > 0U) { + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min2_buf); + Vec_u16x8 pos = no_sanitize::MaskedLoad(hn::RebindMask(du16x8, pg_tail1), + du16x8, pos_buf); + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = + no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_buf); + merged_mins = 
hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreN(merged_mins, di16x8, ptr_r, tail1); + + // update l + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreN(llrs_reg, di16x8, ptr_l, tail1); + ptr_l += tail1; + ptr_r += tail1; + sgn_scratch_buf += tail1; + sgn_buf += tail1; + min1_buf += tail1; + min2_buf += tail1; + pos_buf += tail1; + } + + // 0 to shift-1 + ptr_l = &l[col_block * z]; // point to start + for (uint32_t v_cnt = 0; v_cnt < blk2; v_cnt++) { + + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + Vec_u16x8 pos = hn::LoadU(du16x8, pos_buf); + + // check if this the column matching position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = hn::LoadU(di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = hn::LoadU(di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreU(merged_mins, di16x8, ptr_r); + + // update l + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreU(llrs_reg, di16x8, ptr_l); + + ptr_l += num_lanes; + ptr_r += num_lanes; + sgn_scratch_buf += num_lanes; + sgn_buf += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + pos_buf += num_lanes; + } + + if (tail2 > 0U) { + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min2_buf); + Vec_u16x8 pos = no_sanitize::MaskedLoad(hn::RebindMask(du16x8, pg_tail2), + du16x8, pos_buf); + + // check if this the column matches position for the min1 + Mask_i16x8 pos_mask = hn::RebindMask(di16x8, hn::Eq(pos, pos_current)); + + // if yes replace min1 with min2, otherwise min1 + Vec_i16x8 merged_mins = hn::IfThenElse(pos_mask, min2, min1); + + // apply sign + Vec_i16x8 signs = + no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_scratch_buf); + merged_mins = hn::Mul(merged_mins, signs); + + // apply sign product + Vec_i16x8 sign_prod = no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_buf); + merged_mins = hn::Mul(merged_mins, sign_prod); + + // update r + hn::StoreN(merged_mins, di16x8, ptr_r, tail2); + + // update l + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_l); + llrs_reg = hn::SaturatedAdd(llrs_reg, merged_mins); + hn::StoreN(llrs_reg, di16x8, ptr_l, tail2); + + ptr_l += tail2; + ptr_r += tail2; + sgn_scratch_buf += tail2; + sgn_buf += tail2; + min1_buf += tail2; + min2_buf += tail2; + pos_buf += tail2; + } + + r_i++; + } + + // update r index for next layer + *r_index = r_i; +} + +// Variable nodes transmit their belief information to the connected check +// nodes. Decoding alogrithm implemented is scaled offset min-sum. outputs mins, +// signs and sign product to update the total belief. 
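+//
+// Illustrative scalar sketch only (the routine below is vectorized and the
+// names here are hypothetical): for each check node m and each connected
+// variable node n it effectively computes
+//
+//   int16_t l_val  = sat_sub(llr[n], r[m][n]);   // variable-to-check message
+//   sign_prod[zb] *= (l_val < 0) ? -1 : 1;       // running sign product
+//   // track min1 = smallest |l_val|, min2 = second smallest,
+//   // pos = column index where min1 occurred
+//
+// followed by the offset (2) and scale (0.75, i.e. 24576 in Q15) correction,
+// roughly:
+//
+//   min = sat_sub(min, 2); if (min < 0) { min = 0; }
+//   min = (int16_t)(((int32_t)min * 24576) >> 15);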
+HWY_FORCED_INLINE void compute_l_r_and_mins( + int16_t *__restrict__ l, int16_t *__restrict__ r, + const armral_ldpc_base_graph_t *graph, uint16_t z, uint32_t lsi, + uint16_t layer, int16_t *__restrict__ row_min1_array, + int16_t *__restrict__ row_min2_array, int16_t *__restrict__ row_sign_array, + uint16_t *__restrict__ row_pos_array, int16_t *__restrict__ sign_scratch, + uint32_t *__restrict__ r_index) { + + const uint32_t *col_indices; + uint32_t i; + uint32_t j; + uint32_t r_i = *r_index; + uint32_t t_i = 0; + uint32_t num_lanes = hn::Lanes(di16x8); + + i = graph->row_start_inds[layer]; + // Get the number of nonzero entries in the row + j = graph->row_start_inds[layer + 1] - i; + col_indices = graph->col_inds + i; + const uint32_t *shift_ptr = graph->shifts + i * 8 + lsi * j; + + int16_t *sgn_scratch_buf = sign_scratch; + + Vec_i16x8 offset8 = hn::Set(di16x8, 2); + + Vec_i16x8 plus1 = hn::Set(di16x8, 1); + Vec_i16x8 minus1 = hn::Set(di16x8, -1); + + // for each column i.e only non -1's + for (uint32_t col = 0; col < j; col++) { + uint32_t col_block = col_indices[col]; + + int16_t *ptr_r = &r[r_i * z]; + uint32_t shift = shift_ptr[col] % z; + + uint32_t blk1 = (z - shift) / num_lanes; + uint32_t blk2 = shift / num_lanes; + uint32_t tail1 = (z - shift) & (num_lanes - 1); + uint32_t tail2 = (shift) & (num_lanes - 1); + Mask_i16x8 pg_tail1 = hn::FirstN(di16x8, tail1); + Mask_i16x8 pg_tail2 = hn::FirstN(di16x8, tail2); + + int16_t *min1_buf = row_min1_array; + int16_t *min2_buf = row_min2_array; + int16_t *sgn_buf = row_sign_array; // set to 0 + uint16_t *pos_buf = row_pos_array; + + // Loop over z + // shift to z-1 + int16_t *ptr_l = &l[col_block * z + shift]; // Input,point to shift + + for (uint32_t v_cnt = 0; v_cnt < blk1; v_cnt++) { + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + Vec_i16x8 r_reg = hn::LoadU(di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absoluate + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreU(signs, di16x8, sgn_scratch_buf); + + // Sign product + Vec_i16x8 old_sgn = hn::LoadU(di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreU(sgn, di16x8, sgn_buf); + + // store updated L + hn::StoreU(vec16, di16x8, ptr_l); + + // Find min1 and min2 + Vec_i16x8 min1_old = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2_old = hn::LoadU(di16x8, min2_buf); + + Vec_i16x8 min2 = hn::Max(min1_old, hn::Min(min2_old, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // find min1 position + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = hn::LoadU(du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreU(min2, di16x8, min2_buf); + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(pos_updt, du16x8, pos_buf); + + ptr_l += num_lanes; + ptr_r += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + pos_buf += num_lanes; + sgn_scratch_buf += num_lanes; + } + + if (tail1 > 0U) { + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_l); + Vec_i16x8 r_reg = no_sanitize::MaskedLoad(pg_tail1, di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absolute + 
Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreN(signs, di16x8, sgn_scratch_buf, tail1); + + // Sign product + Vec_i16x8 old_sgn = no_sanitize::MaskedLoad(pg_tail1, di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreN(sgn, di16x8, sgn_buf, tail1); + + // store updated L + hn::StoreN(vec16, di16x8, ptr_l, tail1); + + // Find min1 and min2 + Vec_i16x8 min1_old = no_sanitize::MaskedLoad(pg_tail1, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail1, di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // Find min1 pos + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = no_sanitize::MaskedLoad( + hn::RebindMask(du16x8, pg_tail1), du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreN(min2, di16x8, min2_buf, tail1); + hn::StoreN(min1, di16x8, min1_buf, tail1); + hn::StoreN(pos_updt, du16x8, pos_buf, tail1); + + ptr_l += tail1; + ptr_r += tail1; + min1_buf += tail1; + min2_buf += tail1; + sgn_buf += tail1; + pos_buf += tail1; + sgn_scratch_buf += tail1; + } + // 0 to shift-1 + ptr_l = &l[col_block * z]; // point to start + for (uint32_t v_cnt = 0; v_cnt < blk2; v_cnt++) { + Vec_i16x8 llrs_reg = hn::LoadU(di16x8, ptr_l); + Vec_i16x8 r_reg = hn::LoadU(di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absoluate + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreU(signs, di16x8, sgn_scratch_buf); + + // Sign product + Vec_i16x8 old_sgn = hn::LoadU(di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreU(sgn, di16x8, sgn_buf); + + // store updated L + hn::StoreU(vec16, di16x8, ptr_l); + + // Find min1 and min2 + Vec_i16x8 min1_old = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // find min1 position + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = hn::LoadU(du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreU(min2, di16x8, min2_buf); + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(pos_updt, du16x8, pos_buf); + + ptr_l += num_lanes; + ptr_r += num_lanes; + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + pos_buf += num_lanes; + sgn_scratch_buf += num_lanes; + } + + if (tail2 > 0U) { + Vec_i16x8 llrs_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_l); + Vec_i16x8 r_reg = no_sanitize::MaskedLoad(pg_tail2, di16x8, ptr_r); + + // Subtraction + Vec_i16x8 vec16 = hn::SaturatedSub(llrs_reg, r_reg); + + // Absolute + Vec_i16x8 abs_vec16 = hn::SaturatedAbs(vec16); + + // Store signs + Vec_i16x8 signs = hn::IfNegativeThenElse(vec16, minus1, plus1); + hn::StoreN(signs, di16x8, sgn_scratch_buf, tail2); + + // Sign product + Vec_i16x8 old_sgn 
= no_sanitize::MaskedLoad(pg_tail2, di16x8, sgn_buf); + Vec_i16x8 sgn = hn::Mul(signs, old_sgn); + // store updated sign + hn::StoreN(sgn, di16x8, sgn_buf, tail2); + + // store updated L + hn::StoreN(vec16, di16x8, ptr_l, tail2); + + // Find min1 and min2 + Vec_i16x8 min1_old = no_sanitize::MaskedLoad(pg_tail2, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail2, di16x8, min2_buf); + + min2 = hn::Max(min1_old, hn::Min(min2, abs_vec16)); + Vec_i16x8 min1 = hn::Min(abs_vec16, min1_old); + + // Find min1 pos + // check if the current min1 has changed w.r.t previous + // if it has changed, then update the index to current pos + Mask_u16x8 pos_mask = hn::RebindMask(du16x8, hn::Eq(min1, min1_old)); + Vec_u16x8 pos_old = no_sanitize::MaskedLoad( + hn::RebindMask(du16x8, pg_tail2), du16x8, pos_buf); + Vec_u16x8 pos_cur = hn::Set(du16x8, col); + Vec_u16x8 pos_updt = hn::IfThenElse(pos_mask, pos_old, pos_cur); + + hn::StoreN(min2, di16x8, min2_buf, tail2); + hn::StoreN(min1, di16x8, min1_buf, tail2); + hn::StoreN(pos_updt, du16x8, pos_buf, tail2); + + ptr_l += tail2; + ptr_r += tail2; + min1_buf += tail2; + min2_buf += tail2; + sgn_buf += tail2; + pos_buf += tail2; + sgn_scratch_buf += tail2; + } + r_i++; + t_i++; + } + + *r_index = r_i - t_i; + + // offset and scale min1 and min2 + // in the same loop, adjust sign product + uint32_t blk = z / num_lanes; + uint32_t tail = z & (num_lanes - 1); + Mask_i16x8 pg_tail = hn::FirstN(di16x8, tail); + + Vec_i16x8 scale = hn::Set(di16x8, 24576); // 0.75 + int16_t *min1_buf = row_min1_array; + int16_t *min2_buf = row_min2_array; + int16_t *sgn_buf = row_sign_array; + + for (uint16_t z1 = 0; z1 < blk; z1++) { + Vec_i16x8 sgn = hn::LoadU(di16x8, sgn_buf); + sgn = hn::IfNegativeThenElse(sgn, minus1, plus1); + hn::StoreU(sgn, di16x8, sgn_buf); + + Vec_i16x8 min1 = hn::LoadU(di16x8, min1_buf); + Vec_i16x8 min2 = hn::LoadU(di16x8, min2_buf); + + // apply offset + Vec_i16x8 vec1 = hn::SaturatedSub(min1, offset8); + Vec_i16x8 vec2 = hn::SaturatedSub(min2, offset8); + + // if min1 < 0, then min1 = 0; + vec1 = hn::IfNegativeThenElse(vec1, hn::Zero(di16x8), vec1); + + // apply scale + min1 = hn::MulFixedPoint15(vec1, scale); + + // if min2 < 0, then min1 = 0; + vec2 = hn::IfNegativeThenElse(vec2, hn::Zero(di16x8), vec2); + + // apply scale + min2 = hn::MulFixedPoint15(vec2, scale); + + // store scaled, offseted min's + hn::StoreU(min1, di16x8, min1_buf); + hn::StoreU(min2, di16x8, min2_buf); + + min1_buf += num_lanes; + min2_buf += num_lanes; + sgn_buf += num_lanes; + } + + if (tail > 0U) { + Vec_i16x8 sgn = no_sanitize::MaskedLoad(pg_tail, di16x8, sgn_buf); + sgn = hn::IfNegativeThenElse(sgn, minus1, plus1); + hn::StoreN(sgn, di16x8, sgn_buf, tail); + + Vec_i16x8 min1 = no_sanitize::MaskedLoad(pg_tail, di16x8, min1_buf); + Vec_i16x8 min2 = no_sanitize::MaskedLoad(pg_tail, di16x8, min2_buf); + + // apply offset + Vec_i16x8 vec1 = hn::SaturatedSub(min1, offset8); + Vec_i16x8 vec2 = hn::SaturatedSub(min2, offset8); + + // if min1 < 0, then min1 = 0 + vec1 = hn::IfNegativeThenElse(vec1, hn::Zero(di16x8), vec1); + + // apply scale + min1 = hn::MulFixedPoint15(vec1, scale); + + vec2 = hn::IfNegativeThenElse(vec2, hn::Zero(di16x8), vec2); + + min2 = hn::MulFixedPoint15(vec2, scale); + + hn::StoreN(min1, di16x8, min1_buf, tail); + hn::StoreN(min2, di16x8, min2_buf, tail); + min1_buf += tail; + min2_buf += tail; + sgn_buf += tail; + } +} + +HWY_FORCED_INLINE bool hard_decision(int16_t *ptr_l, uint8_t *crc_buff, + uint8_t *ptr_data, uint32_t k, + uint32_t 
crc_flag) { + + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t k_prime = k + 24; + uint32_t full_vec = (k_prime) / num_lanes; + uint32_t tail_cnt = (k_prime) & (num_lanes - 1); + uint8_t *data = (uint8_t *)crc_buff; + uint32_t pad_bytes = 0; + + // if the decoded data is less than 8 bytes / not multiple of 8 bytes, prefix + // zero padding + if (crc_flag != 0U) { + if (((k_prime >> 3) % 16) != 0U) { + pad_bytes = 16 - ((k_prime >> 3) % 16); + memset(data, 0, pad_bytes); + data = data + pad_bytes; + } + } + + const Vec_i16x8 ones = + hn::Dup128VecFromValues(di16x8, 128, 64, 32, 16, 8, 4, 2, 1); + Mask_i16x8 pg_tail = hn::FirstN(di16x8, tail_cnt); + + for (uint32_t v_cnt = 0; v_cnt < full_vec; v_cnt++) { + Vec_i16x8 d = hn::LoadU(di16x8, ptr_l); + + Vec_u16x8 is_negative = + hn::BitCast(du16x8, hn::IfNegativeThenElseZero(d, ones)); + uint8_t byte1 = (uint8_t)hn::ReduceSum(du16x8, is_negative); + + *data++ = byte1; + ptr_l += num_lanes; + } + + if (tail_cnt != 0U) { + Vec_i16x8 d = no_sanitize::MaskedLoad(pg_tail, di16x8, ptr_l); + Vec_u16x8 is_negative = + hn::BitCast(du16x8, hn::IfNegativeThenElseZero(d, ones)); + uint8_t byte1 = (uint8_t)hn::ReduceSum(du16x8, is_negative); + *data++ = byte1; + ptr_l += tail_cnt; + } + + // Generate the CRC parity bits + uint64_t crc = 0; + if (crc_flag != 0U) { + armral_crc24_b_be((k_prime >> 3) + pad_bytes, (const uint64_t *)crc_buff, + &crc); + // Removing the Zero padding + if (pad_bytes != 0U) { + for (uint32_t i = 0; i < (k_prime >> 3); i++) { + ptr_data[i] = crc_buff[i + pad_bytes]; + } + } + } else { + memcpy(ptr_data, crc_buff, (k + 7) >> 3); + } + + return (crc == 0U); +} + +HWY_FORCED_INLINE void load_ptr_l(int16_t *ptr_l, const int8_t *llrs_ptr, + uint32_t len_in) { + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t full_blk = len_in / num_lanes; + uint32_t tail_cnt = len_in % num_lanes; + + for (uint32_t num_block = 0; num_block < full_blk; num_block++) { + Vec_i16x8 vec_16 = hn::PromoteTo(di16x8, hn::LoadU(di8x8, llrs_ptr)); + hn::StoreU(vec_16, di16x8, ptr_l); + ptr_l += num_lanes; + llrs_ptr += num_lanes; + } + + if (tail_cnt != 0U) { + for (uint32_t i = 0; i < tail_cnt; i++) { + ptr_l[i] = (int16_t)llrs_ptr[i]; + } + } +} + +template +bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, + uint32_t crc_idx, uint32_t num_its, uint8_t *data_out, + Allocator &allocator) { + + bool crc_passed = false; + + // Get the base graph and the lifting size + const auto *graph = armral_ldpc_get_base_graph(bg); + uint32_t lsi = get_lifting_index(z); + + // (graph->row_start_inds[2] - graph->row_start_inds[1]) Max no of non -1 in + // columns is 19. Note: Min is calculated over 8 byte block lengths, so need + // lesser memory + uint32_t layer_size = 19 * z; + + uint32_t num_llrs = (graph->ncodeword_bits + 2) * z; + // max no of non zeros enteries in H Matrix is 316 + uint32_t r_mat_size = 316 * z; + + auto l = allocate_uninitialized(allocator, num_llrs); + // matrix R (check-to-variable-node messages) + auto r = allocate_zeroed(allocator, r_mat_size); + + uint32_t num_lanes = hn::Lanes(di16x8); + uint32_t z_len = z / num_lanes; + uint32_t offset = (z % num_lanes) ? 1 : 0; + z_len = (z_len + offset) * num_lanes; + uint32_t k = (bg == 0) ? 
(z * 22) : (z * 10); + + auto row_min1_array = allocate_uninitialized(allocator, z_len); + auto row_min2_array = allocate_uninitialized(allocator, z_len); + auto row_sign_array = allocate_zeroed(allocator, z_len); + auto row_pos_array = allocate_zeroed(allocator, z); + auto sign_scratch = allocate_uninitialized(allocator, layer_size); + auto crc_buff = allocate_zeroed(allocator, ((k + 7) >> 3) + 15); + + // NOTE: All allocations are now done! + if constexpr (Allocator::is_counting) { + return false; + } + + uint32_t r_index = 0; + + // initialization with channel LLRs. 16-bit buffer "l" will be used for + // in-place calculations + int16_t *ptr_l = l.get(); + const auto *llrs_ptr = llrs; + + // 0 memset 2z LLRs from input to fill the punctured bits + memset(ptr_l, 0, sizeof(int16_t) * 2 * z); + ptr_l = ptr_l + 2 * z; + + load_ptr_l(ptr_l, llrs_ptr, graph->ncodeword_bits * z); + + uint32_t full_blk = z_len / num_lanes; + + for (uint32_t it = 0; it < num_its; ++it) { + r_index = 0; + for (uint32_t layer = 0; layer < graph->nrows; layer++) { + + // reset the sign buffer + memset(row_sign_array.get(), 0, sizeof(int16_t) * z); + + // reset the min1 min2 buf to max + int16_t *ptr1 = row_min1_array.get(); + int16_t *ptr2 = row_min2_array.get(); + int16_t *ptr3 = row_sign_array.get(); + + for (uint32_t i = 0; i < full_blk; i++) { + Vec_i16x8 v8 = hn::Set(di16x8, 0x7FFF); + Vec_i16x8 v_sign8 = hn::Set(di16x8, 0x1); + hn::StoreU(v8, di16x8, ptr1); + hn::StoreU(v8, di16x8, ptr2); + hn::StoreU(v_sign8, di16x8, ptr3); + + ptr1 += num_lanes; + ptr2 += num_lanes; + ptr3 += num_lanes; + } + + compute_l_r_and_mins(l.get(), r.get(), graph, z, lsi, layer, + row_min1_array.get(), row_min2_array.get(), + row_sign_array.get(), row_pos_array.get(), + sign_scratch.get(), &r_index); + + update_l_and_r(l.get(), r.get(), graph, z, lsi, layer, + row_min1_array.get(), row_min2_array.get(), + row_sign_array.get(), row_pos_array.get(), + sign_scratch.get(), &r_index); + } + + // early exit if crc Passes + if (crc_idx) { + if (it < (num_its - 1)) { + crc_passed = + hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true); + if (crc_passed) { + return crc_passed; + } + } + } + } + + if (crc_idx == ARMRAL_LDPC_NO_CRC) { // do only decisions + crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0], + graph->nmessage_bits * z, false); + } else { + crc_passed = + hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true); + } + return crc_passed; +} + +} // namespace armral::ldpc + +template bool armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, heap_allocator &); + +template bool armral::ldpc::decode_block( + const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx, + uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &); + +armral_status armral_ldpc_decode_block(const int8_t *llrs, + armral_ldpc_graph_t bg, uint32_t z, + uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out) { + + heap_allocator allocator{}; + bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, + data_out, allocator); + return (result) ? 
ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; +} + +armral_status +armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg, + uint32_t z, uint32_t crc_idx, uint32_t num_its, + uint8_t *data_out, void *buffer) { + + buffer_bump_allocator allocator{buffer}; + bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, + data_out, allocator); + return (result) ? ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL; +} + +uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg, + uint32_t z, + uint32_t crc_idx, + uint32_t num_its) { + counting_allocator allocator{}; + armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr, + allocator); + return allocator.required_bytes(); +} diff --git a/src/UpperPHY/LDPC/highway/ldpc_encoder.cpp b/src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp similarity index 100% rename from src/UpperPHY/LDPC/highway/ldpc_encoder.cpp rename to src/UpperPHY/LDPC/highway/arm_ldpc_encoder.cpp diff --git a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp b/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp deleted file mode 100644 index ea6d98441e944d258b085b7947c4085a201fedc4..0000000000000000000000000000000000000000 --- a/src/UpperPHY/LDPC/highway/ldpc_decoder.cpp +++ /dev/null @@ -1,967 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its - affiliates - SPDX-License-Identifier: BSD-3-Clause -*/ - -#include "../ldpc_coding.hpp" -#include "utils/allocators.hpp" -#include "utils/bits_to_bytes.hpp" - -#include "utils/hwy_types.hpp" -namespace hn = hwy::HWY_NAMESPACE; - -#include -#include -#include - -namespace { -using Mask_i16 = hn::Mask; - -inline int16_t __attribute__((always_inline)) sat_abs_16(int16_t a) { - int16_t partial_res = abs((int32_t)a); - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -inline int16_t __attribute__((always_inline)) sat_add_16(int16_t a, int16_t b) { - int16_t partial_res = (uint32_t)a + (uint32_t)b; - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -inline int16_t __attribute__((always_inline)) sat_sub_16(int16_t a, int16_t b) { - int16_t partial_res = (int32_t)a - (int32_t)b; - if (partial_res > INT8_MAX) { - return INT8_MAX; - } - if (partial_res < INT8_MIN) { - return INT8_MIN; - } - return partial_res; -} - -struct ldpc_layer_data { - uint32_t z; - uint32_t lsi; - uint32_t row; - uint32_t row_start_ind; - const armral_ldpc_base_graph_t *graph; - uint32_t num_cols; - const uint32_t *shift_ptr; - const uint32_t *col_ptr; - - ldpc_layer_data(uint32_t z_in, uint32_t lsi_in, - const armral_ldpc_base_graph_t *graph_in) - : z(z_in), lsi(lsi_in), row(0), row_start_ind(0), graph(graph_in), - num_cols(graph->row_start_inds[1]), - shift_ptr(graph->shifts + lsi * num_cols), col_ptr(graph->col_inds) {} - - void next() { - row++; - row_start_ind = graph->row_start_inds[row]; - col_ptr += num_cols; - num_cols = graph->row_start_inds[row + 1] - row_start_ind; - shift_ptr = graph->shifts + row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - } -}; - -template -inline T max(T a, T b) { - return a > b ? a : b; -} - -template -inline T min(T a, T b) { - return a < b ? 
a : b; -} - -enum lifting_size_category { CAT_TINY, CAT_TAIL, CAT_LARGE }; - -template -class crc_checker { -public: - crc_checker(uint32_t z, uint32_t crc_idx, Allocator &allocator) : m_z(z) { - // Calculate K', which is the number of info bits + CRC bits (i.e. the - // non-filler bits of the code block) - m_k_prime = crc_idx + 24; - - // The CRC calculation routine expects a particular size of input (n % 16 = 0 - // where n is the number of bytes), which requires padding the input to the - // required size - m_buffer_size = (m_k_prime + 7) / 8; - m_total_bits = m_k_prime; - if (m_k_prime % 128 != 0) { - m_num_pad_bits = 128 - (m_k_prime % 128); - m_total_bits = m_k_prime + m_num_pad_bits; - m_buffer_size = m_total_bits >> 3; - } - - m_llrs = allocate_uninitialized(allocator, m_total_bits + m_z - 1); - m_buffer = allocate_uninitialized(allocator, m_buffer_size); - } - - bool check(const int16_t *new_llrs) { - // Copy the LLRs corresponding to the bits we need to do the CRC check after - // the padding bits - memset(m_llrs.get(), 0, m_num_pad_bits * sizeof(int16_t)); - for (uint32_t num_block = 0; num_block < ((m_k_prime + m_z - 1) / m_z); - num_block++) { - memcpy(m_llrs.get() + m_num_pad_bits + (num_block * m_z), - new_llrs + (2 * num_block * m_z), m_z * sizeof(int16_t)); - } - - // Hard decode - armral::llrs_to_bits(m_total_bits, m_llrs.get(), m_buffer.get()); - - // Generate the CRC parity bits - uint64_t crc; - armral_crc24_b_be(m_buffer_size, (const uint64_t *)m_buffer.get(), &crc); - - // If the CRC is zero then the code block has been correctly decoded and we - // can terminate the iterations early - return (crc == 0); - } - -private: - uint32_t m_z{0}; - uint32_t m_k_prime{0}; - uint32_t m_buffer_size{0}; - uint32_t m_num_pad_bits{0}; - uint32_t m_total_bits{0}; - unique_ptr m_llrs; - unique_ptr m_buffer; -}; - -template -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, int16_t *check); - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - // Loop through the rows in the base graph - bool passed = true; - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - // Loop through the rows in the block - for (uint32_t zb = 0; zb < z && passed; ++zb) { - // Loop through the columns in the row - int16_t scal_check = 0; - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] + zb) % z; - auto codeword_ind = col_ptr[col] * z + shift; - scal_check ^= llrs[codeword_ind]; - } - passed &= scal_check >= 0; - } - } - return passed; -} - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - // Loop through the rows in the base graph - bool passed = true; - Mask_i16 pg_tail = hn::FirstN(di16, (size_t)tail_size); - - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - 
row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - memset(check, 0, z * sizeof(int16_t)); - - // Loop through the columns - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] % z); - auto codeword_ind = col_ptr[col] * (2 * z) + shift; - - // No need to loop here, as there is only a tail - const int16_t *llrs_ptr = llrs + codeword_ind; - int16_t *check_ptr = check; - - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreN(result_reg, di16, check_ptr, tail_size); - } - for (uint32_t zb = 0; zb < z && passed; ++zb) { - passed &= check[zb] >= 0; - } - } - return passed; -} - -template<> -bool parity_check(const int16_t *llrs, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, - int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *check) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the rows in the base graph - bool passed = true; - for (uint32_t row = 0; row < graph->nrows && passed; ++row) { - auto row_start_ind = graph->row_start_inds[row]; - auto num_cols = graph->row_start_inds[row + 1] - row_start_ind; - const auto *col_ptr = graph->col_inds + row_start_ind; - const auto *shift_ptr = graph->shifts + - row_start_ind * armral::ldpc::num_lifting_sets + - lsi * num_cols; - memset(check, 0, z * sizeof(int16_t)); - - // Loop through the columns - for (uint32_t col = 0; col < num_cols; ++col) { - auto shift = (shift_ptr[col] % z); - auto codeword_ind = col_ptr[col] * (2 * z) + shift; - // Loop through the rows in the block - - // The check can be done on the LLRs instead of on the bit values, as - // there is a one-to-one transform between LLRs and bit. Negative LLRs - // represent a hard decision for the bit to be one, and non-negative - // values represent a zero. Hence the check needs to xor all LLRs - // and then assert that the result is non-negative. 
- const int16_t *llrs_ptr = llrs + codeword_ind; - int16_t *check_ptr = check; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 check_reg = hn::LoadU(di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreU(result_reg, di16, check_ptr); - - // Increment pointers - llrs_ptr += num_lanes; - check_ptr += num_lanes; - } - // Process tail - if (tail_size != 0) { - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 check_reg = no_sanitize::MaskedLoad(pg_tail, di16, check_ptr); - Vec_i16 result_reg = hn::Xor(check_reg, llrs_reg); - hn::StoreN(result_reg, di16, check_ptr, tail_size); - } - } - for (uint32_t zb = 0; zb < z && passed; ++zb) { - passed &= check[zb] >= 0; - } - } - return passed; -} - -// For each check node m in the layer, compute: -// - the variable-to-check-node messages L(n,m) for each variable node n in -// \psi(m), where \psi(m) is the set of variable nodes connected to m: -// L(n,m) = LLR(n) - R(n,m) -// - the products \prod_{n' \in \psi(m)} L(n',m) (they will be used to compute -// sign(R(n,m)) in a second step) -// - \min_{n \in \psi(m)} |L(n,m)| and the second minimum (they will be used to -// compute |R(n,m)| in a second step) -template -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array); - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array) { - const auto *r_ptr = r; - // Loop through the Z rows in the layer (check node m) - for (uint32_t zb = 0; zb < d->z; ++zb) { - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - auto shift = (d->shift_ptr[0] + zb) % d->z; - int16_t l_val = sat_sub_16(llrs[d->col_ptr[0] * d->z + shift], *(r_ptr++)); - - int16_t row_sign = l_val; - - int16_t row_min = sat_abs_16(l_val); - - *(l++) = l_val; - - // Column 1 - shift = (d->shift_ptr[1] + zb) % d->z; - l_val = sat_sub_16(llrs[d->col_ptr[1] * d->z + shift], *(r_ptr++)); - - row_sign ^= l_val; - - int16_t abs_val = sat_abs_16(l_val); - int16_t row_min2 = max(row_min, abs_val); - row_min = min(row_min, abs_val); - - *(l++) = l_val; - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - // Compute L(n,m) = LLR(n) - R(n,m) - shift = (d->shift_ptr[col] + zb) % d->z; - l_val = sat_sub_16(llrs[d->col_ptr[col] * d->z + shift], *(r_ptr++)); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - row_sign ^= l_val; - - // Compute the min(|L(n,m)|) and the second minimum - abs_val = sat_abs_16(l_val); - row_min2 = max(row_min, min(row_min2, abs_val)); - row_min = min(row_min, abs_val); - - // Store L(n,m) - *(l++) = l_val; - } - - // Store the two minima and the product for Z rows - row_min_array[zb] = row_min; - row_min2_array[zb] = row_min2; - row_sign_array[zb] = row_sign; - } -} - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t 
*row_min2_array, - int16_t *row_sign_array) { - // Case for lifting sizes Z such as 8 <= Z < 16 - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - int16_t *l_ptr = l; - auto shift = d->shift_ptr[0] % d->z; - const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; - const int16_t *r_ptr = r; - - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 row_sign = l_reg; - - Vec_i16 row_min = hn::SaturatedAbs(l_reg); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - - // Column 1 - l_ptr = l + d->z; - shift = d->shift_ptr[1] % d->z; - llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; - r_ptr = r + d->z; - - r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - row_sign = hn::Xor(row_sign, l_reg); - - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Vec_i16 row_min2 = hn::Max(row_min, abs_reg); - row_min = hn::Min(row_min, abs_reg); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - l_ptr = l + d->z * col; - shift = d->shift_ptr[col] % d->z; - llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; - r_ptr = r + d->z * col; - - // Compute L(n,m) = LLR(n) - R(n,m) - r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - row_sign = hn::Xor(row_sign, l_reg); - - // Compute the min(|L(n,m)|) and the second minimum - abs_reg = hn::SaturatedAbs(l_reg); - row_min2 = hn::Max(row_min, hn::Min(row_min2, abs_reg)); - row_min = hn::Min(row_min, abs_reg); - - // Store L(n,m) - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Store the two minima and the product for Z rows - hn::StoreN(row_min, di16, row_min_array, tail_size); - hn::StoreN(row_min2, di16, row_min2_array, tail_size); - hn::StoreN(row_sign, di16, row_sign_array, tail_size); -} - -template<> -void compute_l_product_min1_and_min2( - int16_t *l, const int16_t *__restrict__ llrs, const int16_t *__restrict__ r, - const ldpc_layer_data *d, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, int16_t *row_min2_array, - int16_t *row_sign_array) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - // Column 0 - int16_t *l_ptr = l; - auto shift = d->shift_ptr[0] % d->z; - const int16_t *llrs_ptr = llrs + d->col_ptr[0] * (2 * d->z) + shift; - const int16_t *r_ptr = r; - int16_t *sign_ptr = row_sign_array; - int16_t *min_ptr = row_min_array; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - hn::StoreU(l_reg, di16, sign_ptr); - - hn::StoreU(hn::SaturatedAbs(l_reg), di16, min_ptr); - - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, 
llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - hn::StoreN(l_reg, di16, sign_ptr, tail_size); - - hn::StoreN(hn::SaturatedAbs(l_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Column 1 - shift = d->shift_ptr[1] % d->z; - l_ptr = l + d->z; - llrs_ptr = llrs + d->col_ptr[1] * (2 * d->z) + shift; - r_ptr = r + d->z; - sign_ptr = row_sign_array; - min_ptr = row_min_array; - int16_t *min2_ptr = row_min2_array; - - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); - - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreU(hn::Max(min_reg, abs_reg), di16, min2_ptr); - hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); - - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); - - Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreN(hn::Max(min_reg, abs_reg), di16, min2_ptr, tail_size); - hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - - // Columns n >= 2 - for (uint32_t col = 2; col < d->num_cols; ++col) { - l_ptr = l + d->z * col; - shift = d->shift_ptr[col] % d->z; - llrs_ptr = llrs + d->col_ptr[col] * (2 * d->z) + shift; - r_ptr = r + d->z * col; - sign_ptr = row_sign_array; - min_ptr = row_min_array; - min2_ptr = row_min2_array; - - // Loop through the Z rows in the layer (check node m) - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - // Compute L(n,m) = LLR(n) - R(n,m) - Vec_i16 r_reg = hn::LoadU(di16, r_ptr); - Vec_i16 llrs_reg = hn::LoadU(di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - // Compute the product of L(n',m), for all the columns (all n' in psi(m)) - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - hn::StoreU(hn::Xor(sign_reg, l_reg), di16, sign_ptr); - - // Compute the min(|L(n,m)|) and the second minimum - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreU(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr); - hn::StoreU(hn::Min(min_reg, abs_reg), di16, min_ptr); - - // Store L(n,m) - hn::StoreU(l_reg, di16, l_ptr); - - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - r_ptr += num_lanes; - l_ptr += num_lanes; - llrs_ptr += num_lanes; - } - - // Process tail - if (tail_size != 0) { - Vec_i16 r_reg = no_sanitize::MaskedLoad(pg_tail, di16, r_ptr); - Vec_i16 llrs_reg = no_sanitize::MaskedLoad(pg_tail, di16, llrs_ptr); - Vec_i16 l_reg = hn::SaturatedSub(llrs_reg, r_reg); - - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - hn::StoreN(hn::Xor(sign_reg, l_reg), di16, sign_ptr, tail_size); - - Vec_i16 min_reg = 
no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - hn::StoreN(hn::Max(min_reg, hn::Min(min2_reg, abs_reg)), di16, min2_ptr, - tail_size); - hn::StoreN(hn::Min(min_reg, abs_reg), di16, min_ptr, tail_size); - - hn::StoreN(l_reg, di16, l_ptr, tail_size); - } - } -} - -// For each check node m in the layer, compute: -// - The check-to-variable-node messages R(n,m) for each n in \psi(m), where -// \psi(m) is the set of variable nodes connected to check node m: -// sign(R(n,m)) = \prod_{n' \in \psi(m)/n} sign(L(n',m)) = -// = \prod_{n' \in \psi(m)} sign(L(n',m)) / sign(L(n,m)) -// |R(n,m)| = \min_{n' \in \psi(m)/n} |L(n',m)| = -// = the first minimum when n' != n, the second minimum otherwise -// - The log likelihood ratios for each n in \psi(m): -// LLR(n) = R(n,m) + L(n,m) -template -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array); - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - // Loop through the Z rows in the layer (check node m) - for (uint32_t zb = 0; zb < d->z; ++zb) { - const int16_t *l_ptr = l + zb * d->num_cols; - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - int16_t col_sign = (row_sign_array[zb] ^ l_ptr[col]) < 0 ? -1 : 1; - - // Compute R(n,m) - int16_t abs_val = sat_abs_16(l_ptr[col]); - int16_t r_val = - col_sign * (abs_val == row_min_array[zb] ? 
row_min2_array[zb] - : row_min_array[zb]); - - // Compute LLR(n) = R(n,m) + L(n,m) - auto shift = (d->shift_ptr[col] + zb) % d->z; - auto col_ind = d->col_ptr[col] * d->z + shift; - llrs[col_ind] = sat_add_16(r_val, l_ptr[col]); - - // Store R(n,m) for the next iteration - r[col] = r_val; - } - } -} - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - // Case for lifting sizes 4 <= Z < 8 (rows in the layer) - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - Vec_i16 row_min = no_sanitize::MaskedLoad(pg_tail, di16, row_min_array); - Vec_i16 row_min2 = no_sanitize::MaskedLoad(pg_tail, di16, row_min2_array); - Vec_i16 row_sign = no_sanitize::MaskedLoad(pg_tail, di16, row_sign_array); - - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - auto shift = d->shift_ptr[col] % d->z; - auto col_ind = d->col_ptr[col] * (2 * d->z); - int16_t *r_ptr = r + d->z * col; - const int16_t *l_ptr = l + d->z * col; - int16_t *llrs_ptr = llrs + col_ind + shift; - - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Vec_i16 eor_reg = hn::Xor(row_sign, l_reg); - Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Mask_i16 pg_tail_eq = hn::Eq(abs_reg, row_min); - Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, row_min2, row_min); - Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreN(result, di16, llrs_ptr, tail_size); - - // Store R(n,m) for the next iteration - hn::StoreN(r_reg, di16, r_ptr, tail_size); - - // Rearrange LLRs - memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); - // copy (z - shift) elts in the main block to the replicated block - memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, - (d->z - shift) * sizeof(int16_t)); - } -} - -template<> -void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, - const ldpc_layer_data *d, int32_t num_lanes, - int32_t full_vec, uint32_t tail_size, - const int16_t *row_min_array, - const int16_t *row_min2_array, - const int16_t *row_sign_array) { - Mask_i16 pg_tail = hn::FirstN(di16, tail_size); - - // Loop through the columns in the row (variable node n in psi(m)) - for (uint32_t col = 0; col < d->num_cols; ++col) { - auto shift = d->shift_ptr[col] % d->z; - auto col_ind = d->col_ptr[col] * (2 * d->z); - int16_t *llrs_ptr = llrs + col_ind + shift; - const int16_t *l_ptr = l + d->z * col; - int16_t *r_ptr = r + d->z * col; - const int16_t *sign_ptr = row_sign_array; - const int16_t *min_ptr = row_min_array; - const int16_t *min2_ptr = row_min2_array; - - // Loop through the Z rows in the layer (check node m) - for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = hn::LoadU(di16, l_ptr); - Vec_i16 sign_reg = hn::LoadU(di16, sign_ptr); - Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); - Mask_i16 pg_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Vec_i16 min_reg = hn::LoadU(di16, min_ptr); - Vec_i16 min2_reg = hn::LoadU(di16, min2_ptr); - 
Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Mask_i16 pg_eq = hn::Eq(abs_reg, min_reg); - Vec_i16 tmp_reg = hn::IfThenElse(pg_eq, min2_reg, min_reg); - Vec_i16 r_reg = hn::IfThenElse(pg_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreU(result, di16, llrs_ptr); - - // Store R(n,m) for the next iteration - hn::StoreU(r_reg, di16, r_ptr); - - // Increment pointers - l_ptr += num_lanes; - r_ptr += num_lanes; - llrs_ptr += num_lanes; - sign_ptr += num_lanes; - min_ptr += num_lanes; - min2_ptr += num_lanes; - } - - if (tail_size != 0) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) - Vec_i16 l_reg = no_sanitize::MaskedLoad(pg_tail, di16, l_ptr); - Vec_i16 sign_reg = no_sanitize::MaskedLoad(pg_tail, di16, sign_ptr); - Vec_i16 eor_reg = hn::Xor(sign_reg, l_reg); - Mask_i16 pg_tail_neg = hn::Lt(eor_reg, hn::Zero(di16)); - - // Compute R(n,m) - Vec_i16 min_reg = no_sanitize::MaskedLoad(pg_tail, di16, min_ptr); - Vec_i16 min2_reg = no_sanitize::MaskedLoad(pg_tail, di16, min2_ptr); - Vec_i16 abs_reg = hn::SaturatedAbs(l_reg); - Mask_i16 pg_tail_eq = hn::Eq(abs_reg, min_reg); - Vec_i16 tmp_reg = hn::IfThenElse(pg_tail_eq, min2_reg, min_reg); - Vec_i16 r_reg = hn::IfThenElse(pg_tail_neg, hn::Neg(tmp_reg), tmp_reg); - - // Compute LLR(n) = R(n,m) + L(n,m) - Vec_i16 result = hn::SaturatedAdd(r_reg, l_reg); - hn::StoreN(result, di16, llrs_ptr, tail_size); - - // Store R(n,m) for the next iteration - hn::StoreN(r_reg, di16, r_ptr, tail_size); - } - - // Rearrange LLRs - // copy shifted elements in the replicated block - // back to the beginning of the main block - memcpy(llrs + col_ind, llrs + col_ind + d->z, shift * sizeof(int16_t)); - // copy (z - shift) elts in the main block to the replicated block - memcpy(llrs + col_ind + d->z + shift, llrs + col_ind + shift, - (d->z - shift) * sizeof(int16_t)); - } -} - -template -void __attribute__((flatten)) -run_iterations(uint32_t num_its, uint32_t z, uint32_t lsi, - const armral_ldpc_base_graph_t *graph, int16_t *r, int16_t *l, - int16_t *new_llrs, int32_t num_lanes, int32_t full_vec, - uint32_t tail_size, int16_t *row_min_array, - int16_t *row_min2_array, int16_t *row_sign_array, int16_t *check, - bool check_convergence, - std::optional> &crc_checker) { - for (uint32_t i = 0; i < num_its; ++i) { - ldpc_layer_data d(z, lsi, graph); - auto *r_ptr = r; - - // Loop through the layers (groups of Z rows) - compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, - full_vec, tail_size, row_min_array, - row_min2_array, row_sign_array); - compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, - tail_size, row_min_array, row_min2_array, - row_sign_array); - - for (uint32_t row = 1; row < graph->nrows; ++row) { - d.next(); - r_ptr = r + d.row_start_ind * z; - - // Variable-to-check node messages update - compute_l_product_min1_and_min2(l, new_llrs, r_ptr, &d, num_lanes, - full_vec, tail_size, row_min_array, - row_min2_array, row_sign_array); - // LLRs update - compute_r_and_llrs(l, r_ptr, new_llrs, &d, num_lanes, full_vec, - tail_size, row_min_array, row_min2_array, - row_sign_array); - } - - // CRC check and early termination - bool crc_passed = crc_checker.has_value() && crc_checker->check(new_llrs); - if (check_convergence && - (crc_passed || parity_check(new_llrs, z, lsi, graph, num_lanes, - full_vec, tail_size, check))) { - break; - } - } -} - -} // anonymous namespace - -template -void armral::ldpc::decode_block(const int8_t 
+                                uint32_t z, uint32_t crc_idx, uint32_t num_its,
+                                uint8_t *data_out, Allocator &allocator) {
+  // Get the base graph and the lifting size
+  const auto *graph = armral_ldpc_get_base_graph(bg);
+  uint32_t lsi = get_lifting_index(z);
+
+  // Only allocate the CRC checker if necessary.
+  std::optional> maybe_crc_checker;
+  if (crc_idx != ARMRAL_LDPC_NO_CRC) {
+    maybe_crc_checker = crc_checker{z, crc_idx, allocator};
+  }
+
+  const uint32_t num_llrs = (graph->ncodeword_bits + 2) * z;
+
+  // Assign memory for the things that we need.
+  // We know that the first block rows have the largest number of non-zero
+  // entries, so the largest layer will be for the first block rows. In
+  // particular, for both base graphs, the second row is of longest length.
+  uint32_t mat_size = graph->row_start_inds[graph->nrows] * z;
+  uint32_t layer_size =
+      (graph->row_start_inds[2] - graph->row_start_inds[1]) * z;
+  // We need to keep a record of matrix L (variable-to-check-node messages)
+  auto l = allocate_uninitialized(allocator, layer_size);
+  // We need to keep a record of matrix R (check-to-variable-node messages)
+  auto r = allocate_zeroed(allocator, mat_size);
+
+  auto row_min_array = allocate_zeroed(allocator, z);
+  auto row_min2_array = allocate_zeroed(allocator, z);
+  auto row_sign_array = allocate_zeroed(allocator, z);
+
+  auto check = allocate_zeroed(allocator, z);
+
+  // Scalar CAT_TINY tails are less efficient than processing them as a
+  // single, partial vector instruction, so this path is currently disabled.
+  bool z_is_tiny = false;
+
+  // Keep a record of the current and previous values of the LLRs.
+  // Copy the input LLRs.
+  const auto *llrs_ptr = llrs;
+  size_t new_llrs_size = num_llrs;
+  std::optional> maybe_out_llrs;
+  if (!z_is_tiny) {
+    // Double the storage required to replicate LLRs for optimization
+    new_llrs_size *= 2;
+    // Extra buffer to pack the LLRs again
+    maybe_out_llrs = allocate_uninitialized(allocator, num_llrs);
+  }
+  auto new_llrs = allocate_uninitialized(allocator, new_llrs_size);
+
+  // NOTE: All allocations are now done!
+  if constexpr (Allocator::is_counting) {
+    return;
+  }
+
+  if (z_is_tiny) {
+    // Set the value of the current LLRs from the ones passed in.
+    // We need to take account of the punctured columns.
+    // Also widen to int16_t for use in intermediate calculations.
+    memset(new_llrs.get(), 0, 2 * z * sizeof(int16_t));
+    for (uint32_t i = 0; i < z * graph->ncodeword_bits; i++) {
+      new_llrs[2 * z + i] = (int16_t)llrs[i];
+    }
+  } else {
+    // Each block of Z elements is replicated: b1|b1|b2|b2|...
+    // We need to take account of the punctured columns.
+    // Also widen to int16_t for use in intermediate calculations.
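+    // The two punctured columns account for the first 2 * z LLRs; with every
+    // block stored twice they become the 4 * z zeroed entries below. Each
+    // received block b is then written twice (b|b) so that reads of a
+    // cyclically shifted block never wrap around the block boundary.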
+    memset(new_llrs.get(), 0, 4 * z * sizeof(int16_t));
+    auto *new_llrs_ptr = &new_llrs[4 * z];
+    for (uint32_t num_block = 0; num_block < graph->ncodeword_bits;
+         num_block++) {
+      for (uint32_t i = 0; i < z; i++) {
+        new_llrs_ptr[i] = (int16_t)llrs_ptr[i];
+        new_llrs_ptr[z + i] = (int16_t)llrs_ptr[i];
+      }
+      new_llrs_ptr += 2 * z;
+      llrs_ptr += z;
+    }
+  }
+
+  // Precompute the number of full vectors and the tail size
+  int32_t num_lanes = hn::Lanes(di16);
+  int32_t full_vec = z / num_lanes;
+  uint32_t tail_size = z % num_lanes;
+  bool is_tail_only = (tail_size == z && !z_is_tiny);
+
+  if (z_is_tiny) {
+    run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                   new_llrs.get(), num_lanes, full_vec, tail_size,
+                   row_min_array.get(), row_min2_array.get(),
+                   row_sign_array.get(), check.get(),
+                   check_convergence, maybe_crc_checker);
+
+    // Hard decode into the output variable
+    llrs_to_bits(num_llrs, new_llrs.get(), data_out);
+  } else {
+    if (is_tail_only) {
+      run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                     new_llrs.get(), num_lanes, full_vec, tail_size,
+                     row_min_array.get(), row_min2_array.get(),
+                     row_sign_array.get(), check.get(),
+                     check_convergence, maybe_crc_checker);
+    } else {
+      run_iterations(num_its, z, lsi, graph, r.get(), l.get(),
+                     new_llrs.get(), num_lanes, full_vec, tail_size,
+                     row_min_array.get(), row_min2_array.get(),
+                     row_sign_array.get(), check.get(),
+                     check_convergence, maybe_crc_checker);
+    }
+    // Pack LLRs, copy back to original storage
+    auto *out_llrs = maybe_out_llrs.value().get();
+    for (uint32_t num_block = 0; num_block < graph->ncodeword_bits + 2;
+         num_block++) {
+      memcpy(out_llrs + num_block * z, &new_llrs[2 * num_block * z],
+             z * sizeof(int16_t));
+    }
+
+    // Hard decode into the output variable
+    llrs_to_bits(num_llrs, out_llrs, data_out);
+  }
+}
+
+template void armral::ldpc::decode_block(
+    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
+    uint32_t num_its, uint8_t *data_out, heap_allocator &);
+
+template void armral::ldpc::decode_block(
+    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
+    uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &);
+
+armral_status armral_ldpc_decode_block(const int8_t *llrs,
+                                       armral_ldpc_graph_t bg, uint32_t z,
+                                       uint32_t crc_idx, uint32_t num_its,
+                                       uint8_t *data_out) {
+  heap_allocator allocator{};
+  armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out,
+                             allocator);
+  return ARMRAL_SUCCESS;
+}
+
+armral_status
+armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg,
+                                 uint32_t z, uint32_t crc_idx, uint32_t num_its,
+                                 uint8_t *data_out, void *buffer) {
+  buffer_bump_allocator allocator{buffer};
+  armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its, data_out,
+                             allocator);
+  return ARMRAL_SUCCESS;
+}
+
+uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
+                                                      uint32_t z,
+                                                      uint32_t crc_idx,
+                                                      uint32_t num_its) {
+  counting_allocator allocator{};
+  armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr,
+                             allocator);
+  return allocator.required_bytes();
+}
diff --git a/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
index e7a0a3aa0e4eb1eb4e3bbdf761977f6894373725..b136b4c155f2afc5d4331eeb474d6e14771e0b95 100644
--- a/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
+++ b/src/UpperPHY/Polar/highway/arm_polar_decoder_neon.hpp
@@ -414,7 +414,7 @@ inline void combine_seq_out<4, 8>(const uint8_t *seq1, const uint8_t *seq2,
 
   Vec_u16x8 in1 = hn::LoadU(du16x8, (const uint16_t *)seq1);
   Vec_u16x8 in2 = hn::LoadU(du16x8, (const uint16_t *)seq2);
-  Vec_u8x16 h = hn::Combine(du8x16, hn::Set(du8x8, 0), hn::Load(du8x8, hist2));
+  Vec_u8x16 h = hn::Combine(du8x16, hn::Set(du8x8, 0), hn::LoadU(du8x8, hist2));
   h = hn::InterleaveWholeLower(du8x16, h, h);
   Vec_u8x16 h_ofs0 = hn::Dup128VecFromValues(du8x16, 0, 1, 0, 1, 0, 1, 0, 1, 0,
                                              1, 0, 1, 0, 1, 0, 1);
@@ -571,4 +571,4 @@ inline void combine_hist<1>(const uint8_t * /*hist1*/,
   // nothing to do if L=1, only one choice of history.
 }
 
-} // namespace
\ No newline at end of file
+} // namespace
diff --git a/src/utils/highway/bits_to_bytes.hpp b/src/utils/highway/bits_to_bytes.hpp
index 40eb8950c0181834798bed0df7dd4024fdd6e985..3fbfc23b7867b5be9c3521dc767223eff3945f2c 100644
--- a/src/utils/highway/bits_to_bytes.hpp
+++ b/src/utils/highway/bits_to_bytes.hpp
@@ -42,7 +42,8 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in,
     // Generate an index to select this byte pair
     Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind));
     Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices);
-    // Shift the bits we want to convert into the rightmost position and mask out the higher bits
+    // Shift the bits we want to convert into the rightmost position and mask
+    // out the higher bits
     Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts));
     hn::StoreU(spread_bits, du8x16, out);
     out += hn::Lanes(du8x16);
@@ -56,12 +57,14 @@ HWY_FORCED_INLINE void bits_to_bytes(uint32_t n, const uint8_t *in,
   // Load partial vector of remaining bytes
   Vec_u8x16 bytes = no_sanitize::MaskedLoad(load_mask, du8x16, in);
 
-  // We can process two bytes at once, stopping once we reach the end of the data
+  // We can process two bytes at once, stopping once we reach the end of the
+  // data
   for (size_t byte_ind = 0; byte_ind < remaining_bytes; byte_ind += 2) {
     // Generate an index to select this byte pair
     Vec_u8x16 indices = hn::Add(base_indices, hn::Set(du8x16, byte_ind));
     Vec_u8x16 repeated_bytes = hn::TableLookupBytes(bytes, indices);
-    // Shift the bits we want to convert into the rightmost position and mask out the higher bits
+    // Shift the bits we want to convert into the rightmost position and mask
+    // out the higher bits
     Vec_u8x16 spread_bits = hn::And(k1, hn::Shr(repeated_bytes, shifts));
     bool store_remainder = (byte_ind + 2) >= remaining_bytes;
     hn::StoreN(spread_bits, du8x16, out,
diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp
index 9c9a022e3a6510078b0a5b18b324ba80717723a4..4dc43710b11c670318cf34f5a6ec3d0b4c8ad012 100644
--- a/src/utils/hwy_types.hpp
+++ b/src/utils/hwy_types.hpp
@@ -56,6 +56,8 @@ using Vec_f64x2 = hn::Vec;
 using Mask_u8x16 = hn::Mask;
 using Mask_i8x16 = hn::Mask;
 using Mask_u32x4 = hn::Mask;
+using Mask_i16x8 = hn::Mask;
+using Mask_u16x8 = hn::Mask;
 
 /*
 Full64 Vector Types.
@@ -194,7 +196,7 @@ https://google.github.io/highway/en/master/faq.html#correctness
 However their solution is to make use of less optimal code which makes it
 unfeasible. */
 
-#if HWY_IS_ASAN && HWY_TARGET == HWY_NEON
+#if HWY_IS_ASAN && HWY_TARGET & (HWY_ALL_NEON | HWY_SSE4 | HWY_AVX2)
 namespace no_sanitize {
 
 #define NO_ASAN __attribute__((no_sanitize("address")))
@@ -207,8 +209,9 @@ NoASANMaskedLoad(M m, D d, const hn::TFromD *HWY_RESTRICT unaligned) {
 template
 hn::VFromD MaskedLoad(M m, D d,
                       const hn::TFromD *HWY_RESTRICT unaligned) {
-  // Dereference the pointer to allow the AdressSantizer to add acheck that it is valid.
-  // Test the first data lane as well as the last active lane, derived from the mask provided
+  // Dereference the pointer to allow the AddressSanitizer to add a check that
+  // it is valid. Test the first data lane as well as the last active lane,
+  // derived from the mask provided.
   volatile auto test_first_lane = *unaligned;
   volatile auto test_last_active_lane = *(unaligned + FindLastTrue(d, m));
   (void)test_first_lane;