diff --git a/bench/UpperPHY/LDPC/Decoding/main.cpp b/bench/UpperPHY/LDPC/Decoding/main.cpp
index 59840decef7bd3a1b98dde54e16ec7cdda5a4661..6a04de3e7d0c21c51b10051b3089a19165644e33 100755
--- a/bench/UpperPHY/LDPC/Decoding/main.cpp
+++ b/bench/UpperPHY/LDPC/Decoding/main.cpp
@@ -12,14 +12,15 @@
 
 namespace {
 
-void run_ldpc_decoding_perf(armral_ldpc_graph_t bg, uint32_t z,
-                            uint32_t crc_idx, uint32_t num_its,
+void run_ldpc_decoding_perf(uint32_t n, armral_ldpc_graph_t bg, uint32_t z,
+                            uint32_t len_filler_bits, uint32_t num_its,
+                            armral_ldpc_decode_options_t options,
                             uint32_t num_reps) {
   printf("[LDPC DECODING] - base graph = %u, lifting size = %u, number of "
          "decoding "
-         "iterations = %u, index where CRC information begins = %u , number of "
+         "iterations = %u, with length of filler bits = %u , number of "
          "repetitions = %u\n",
-         (uint32_t)bg, z, num_its, crc_idx, num_reps);
+         (uint32_t)bg, z, num_its, len_filler_bits, num_reps);
 
   const auto *graph = armral_ldpc_get_base_graph(bg);
 
@@ -31,18 +32,20 @@ void run_ldpc_decoding_perf(armral_ldpc_graph_t bg, uint32_t z,
 
 #ifdef ARMRAL_BENCH_NOALLOC
   auto buffer_size =
-      armral_ldpc_decode_block_noalloc_buffer_size(bg, z, crc_idx, num_its);
+      armral_ldpc_decode_block_noalloc_buffer_size(bg, z, num_its);
   std::vector<uint8_t> buffer(buffer_size);
   for (uint32_t r = 0; r < num_reps; ++r) {
     buffer_bump_allocator allocator{buffer.data()};
     armral::ldpc::decode_block<buffer_bump_allocator>(
-        llr_ptr, bg, z, crc_idx, num_its, out_ptr, allocator);
+        n, llr_ptr, bg, z, len_filler_bits, out_ptr, num_its, options,
+        allocator);
   }
 #else
   for (uint32_t r = 0; r < num_reps; ++r) {
     heap_allocator allocator{};
-    armral::ldpc::decode_block<heap_allocator>(llr_ptr, bg, z, crc_idx, num_its,
-                                               out_ptr, allocator);
+    armral::ldpc::decode_block<heap_allocator>(n, llr_ptr, bg, z,
+                                               len_filler_bits, out_ptr,
+                                               num_its, options, allocator);
   }
 #endif
 }
@@ -60,16 +63,25 @@ int main(int argc, char **argv) {
     // num_its    - The number of iterations to perform in the decoding
     // num_reps   - The number of times to repeat the decoding so as
     //              to get a stable performance number
-    printf("Usage: %s base_graph z crc_idx num_its num_reps\n", argv[0]);
+    printf(
+        "Usage: %s n base_graph z len_filler_bits num_its options num_reps\n",
+        argv[0]);
     exit(EXIT_FAILURE);
   }
   auto bg = (armral_ldpc_graph_t)(atoi(argv[1]) - 1);
   auto z = (uint32_t)atoi(argv[2]);
   auto crc_idx = (uint32_t)atoi(argv[3]);
+  auto n = (uint32_t)((bg == 0) ? (z * 66) : (z * 50));
+  auto nmessage_bits = bg == LDPC_BASE_GRAPH_1 ? z * 22 : z * 10;
+  auto len_filler_bits = (crc_idx != 0U) ? nmessage_bits - crc_idx - 24 : 0;
   auto num_its = (uint32_t)atoi(argv[4]);
+  auto options =
+      (armral_ldpc_decode_options_t)((crc_idx != 0U)
+                                         ? ARMRAL_LDPC_DEFAULT_OPTIONS
+                                         : ARMRAL_LDPC_CRC_NO);
   auto num_reps = (uint32_t)atoi(argv[5]);
 
-  run_ldpc_decoding_perf(bg, z, crc_idx, num_its, num_reps);
+  run_ldpc_decoding_perf(n, bg, z, len_filler_bits, num_its, options, num_reps);
 
   return EXIT_SUCCESS;
 }
diff --git a/include/armral.h b/include/armral.h
index ae446821c2515db1ab446cfc38dd7a512d4fcfe8..057087a99b692aaba9bb8c909a98795b5ca689df 100644
--- a/include/armral.h
+++ b/include/armral.h
@@ -3403,10 +3403,56 @@ typedef struct {
 } armral_ldpc_base_graph_t;
 
 /**
- * \brief A constant which can be passed to `armral_ldpc_decode_block` when the
- * input code block has no CRC attached.
+ * @enum armral_ldpc_decode_options_t
+ * @brief A constant which can be passed to `armral_ldpc_decode_block` relevant
+ * to the CRC operation type.
+ *
+ * This enumeration defines various options for LDPC decoding,
+ * including different CRC calculation methods, iteration-based CRC checks, and
+ * filler bit handling.
+ *
+ * Enumeration values:
+ * - `ARMRAL_LDPC_CRC_NO`: No CRC calculation.
+ * - `ARMRAL_LDPC_CRC_16`: CRC-16 checksum.
+ * - `ARMRAL_LDPC_CRC_24A`: CRC-24A checksum.
+ * - `ARMRAL_LDPC_CRC_24B`: CRC-24B checksum (default).
+ * - `ARMRAL_LDPC_CRC_EVERY_ITER`: Perform CRC check at every decoding iteration
+ * (default).
+ * - `ARMRAL_LDPC_CRC_END_ITER`: Perform CRC check only at the end of all
+ * decoding iterations.
+ * - `ARMRAL_LDPC_FILLER_BITS_IMPLICIT`: Implicit handling of filler bits.
+ * - `ARMRAL_LDPC_FILLER_BITS_EXPLICIT`: Explicit handling of filler bits
+ * (default).
  */
-#define ARMRAL_LDPC_NO_CRC 0
+typedef enum {
+  ARMRAL_LDPC_CRC_NO = 0, ///< No CRC calculation
+  ARMRAL_LDPC_CRC_16,     ///< CRC-16
+  ARMRAL_LDPC_CRC_24A,    ///< CRC-24A
+  ARMRAL_LDPC_CRC_24B,    ///< CRC-24B (default)
+
+  ARMRAL_LDPC_CRC_EVERY_ITER = (1 << 2), ///< CRC every iteration (default)
+  ARMRAL_LDPC_CRC_END_ITER = (2 << 2),   ///< CRC at end of iterations
+
+  ARMRAL_LDPC_FILLER_BITS_IMPLICIT = (1 << 4), ///< Filler bits implicit
+  ///< Filler bits explicit (default)
+  ARMRAL_LDPC_FILLER_BITS_EXPLICIT = (2 << 4)
+} armral_ldpc_decode_options_t;
+
+/**
+ * @brief Default options for ARMRAL LDPC processing.
+ *
+ * This macro defines the default configuration options for ARMRAL LDPC,
+ * combining the following flags:
+ * - `ARMRAL_LDPC_CRC_24B`: Enables 24-bit CRC checking.
+ * - `ARMRAL_LDPC_CRC_EVERY_ITER`: Performs CRC verification at every decoding
+ * iteration.
+ * - `ARMRAL_LDPC_FILLER_BITS_EXPLICIT`: Explicitly marks filler bits in
+ * decoding.
+ *
+ */
+#define ARMRAL_LDPC_DEFAULT_OPTIONS                                            \
+  (ARMRAL_LDPC_CRC_24B | ARMRAL_LDPC_CRC_EVERY_ITER |                          \
+   ARMRAL_LDPC_FILLER_BITS_EXPLICIT)
 
 /**
  * Uses the identifier of a base graph to get the data structure that describes
@@ -3562,29 +3608,46 @@ uint32_t armral_ldpc_encode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
  * (TS) 38.212. It is possible that there is no CRC data attached to the code
  * block, in which case `ARMRAL_LDPC_NO_CRC` can be passed.
  *
+ * @param[in]  n        The length of `llrs`.
  * @param[in]  llrs     The initial LLRs to use in the decoding. This is
  *                      typically the output after demodulation and rate
  *                      recovery. Supports 8 bit llrs in q1.7.
  * @param[in]  bg       The type of base graph to use for the decoding.
  * @param[in]  z        The lifting size. Valid values of the lifting size are
  *                      described in table 5.3.2-1 in TS 38.212.
- * @param[in]  crc_idx  The index of the bit where the CRC attached to the code
- *                      block begins. If there is no CRC attached, set this to
- *                      `ARMRAL_LDPC_NO_CRC`.
- * @param[in]  num_its  The maximum number of iterations of the LDPC decoder to
- *                      run. The algorithm may terminate after fewer iterations
- *                      if the current candidate codeword passes all the parity
- *                      checks, or if it satisfies the CRC check.
+ * @param[in] len_filler_bits The number of filler bits. As per TS 38.212,
+ *                            section 5.2.2, filler bits insertion is needed to
+ *                            ensure that the code block segments have a valid
+ *                            length and are a multiple of the lifting size.
+ *                            Filler bits are used to calculate CRC internally.
+ *                            This is assumed to be a multiple of 8bits.
  * @param[out] data_out The decoded bits. These are of length `22 * z` for base
  *                      graph 1 and `10 * z` for base graph 2. It is assumed
  *                      that the array `data_out` is able to store this many
  *                      bits.
+ * @param[in]  max_its  The maximum number of iterations of the LDPC decoder to
+ *                      run. The algorithm may terminate after fewer iterations
+ *                      if the current candidate codeword passes all the parity
+ *                      checks, or if it satisfies the CRC check.
+ * @param[in]  options  It is an OR'd result of the below fields,
+ *                      CRC Type:
+ *                          ARMRAL_LDPC_CRC_NO
+ *                          ARMRAL_LDPC_CRC_16
+ *                          ARMRAL_LDPC_CRC_24A
+ *                          ARMRAL_LDPC_CRC_24B (default)
+ *                      CRC Mode:
+ *                          ARMRAL_LDPC_CRC_EVERY_ITER (default)
+ *                          ARMRAL_LDPC_CRC_END_ITER
+ *                      Filler Bits:
+ *                          ARMRAL_LDPC_FILLER_BITS_IMPLICIT
+ *                          ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default).
  * @return     An `armral_status` value that indicates success or failure.
  */
-armral_status armral_ldpc_decode_block(const int8_t *llrs,
+armral_status armral_ldpc_decode_block(uint32_t n, const int8_t *llrs,
                                        armral_ldpc_graph_t bg, uint32_t z,
-                                       uint32_t crc_idx, uint32_t num_its,
-                                       uint8_t *data_out);
+                                       uint32_t len_filler_bits,
+                                       uint8_t *data_out, uint32_t max_its,
+                                       armral_ldpc_decode_options_t options);
 
 /**
  * Non-allocating variant of \link armral_ldpc_decode_block \endlink.
@@ -3612,31 +3675,46 @@ armral_status armral_ldpc_decode_block(const int8_t *llrs,
  * calling \link armral_ldpc_decode_block_noalloc_buffer_size \endlink
  * with identical inputs.
  *
+ * @param[in]  n        The length of `llrs`.
  * @param[in]  llrs     The initial LLRs to use in the decoding. This is
  *                      typically the output after demodulation and rate
  *                      recovery. Supports 8 bit llrs in q1.7.
  * @param[in]  bg       The type of base graph to use for the decoding.
  * @param[in]  z        The lifting size. Valid values of the lifting size are
  *                      described in table 5.3.2-1 in TS 38.212.
- * @param[in]  crc_idx  The index of the bit where the CRC attached to the code
- *                      block begins. If there is no CRC attached, set this to
- *                      `ARMRAL_LDPC_NO_CRC`.
- * @param[in]  num_its  The maximum number of iterations of the LDPC decoder to
- *                      run. The algorithm may terminate after fewer iterations
- *                      if the current candidate codeword passes all the parity
- *                      checks, or if it satisfies the CRC check.
+ * @param[in] len_filler_bits The number of filler bits. As per TS 38.212,
+ *                            section 5.2.2, filler bits insertion is needed to
+ *                            ensure that the code block segments have a valid
+ *                            length and are a multiple of the lifting size.
+ *                            Filler bits are used to calculate CRC internally.
+ *                            This is assumed to be a multiple of 8bits.
  * @param[out] data_out The decoded bits. These are of length `22 * z` for base
  *                      graph 1 and `10 * z` for base graph 2. It is assumed
  *                      that the array `data_out` is able to store this many
  *                      bits.
+ * @param[in]  max_its  The maximum number of iterations of the LDPC decoder to
+ *                      run. The algorithm may terminate after fewer iterations
+ *                      if the current candidate codeword passes all the parity
+ *                      checks, or if it satisfies the CRC check.
+ * @param[in]  options  It is an OR'd result of the below fields,
+ *                      CRC Type:
+ *                          ARMRAL_LDPC_CRC_NO
+ *                          ARMRAL_LDPC_CRC_16
+ *                          ARMRAL_LDPC_CRC_24A
+ *                          ARMRAL_LDPC_CRC_24B (default)
+ *                      CRC Mode:
+ *                          ARMRAL_LDPC_CRC_EVERY_ITER (default)
+ *                          ARMRAL_LDPC_CRC_END_ITER
+ *                      Filler Bits:
+ *                          ARMRAL_LDPC_FILLER_BITS_IMPLICIT
+ *                          ARMRAL_LDPC_FILLER_BITS_EXPLICIT (default).
  * @param[in]  buffer   Workspace buffer to be used internally.
  * @return     An `armral_status` value that indicates success or failure.
  */
-armral_status armral_ldpc_decode_block_noalloc(const int8_t *llrs,
-                                               armral_ldpc_graph_t bg,
-                                               uint32_t z, uint32_t crc_idx,
-                                               uint32_t num_its,
-                                               uint8_t *data_out, void *buffer);
+armral_status armral_ldpc_decode_block_noalloc(
+    uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
+    uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its,
+    armral_ldpc_decode_options_t options, void *buffer);
 
 /**
  * Calculates the required buffer size in bytes needed to perform LDPC decoding
@@ -3645,10 +3723,7 @@ armral_status armral_ldpc_decode_block_noalloc(const int8_t *llrs,
  * @param[in]  bg       The type of base graph to use for the decoding.
  * @param[in]  z        The lifting size. Valid values of the lifting size are
  *                      described in table 5.3.2-1 in TS 38.212.
- * @param[in]  crc_idx  The index of the bit where the CRC attached to the code
- *                      block begins. If there is no CRC attached, set this to
- *                      `ARMRAL_LDPC_NO_CRC`.
- * @param[in]  num_its  The maximum number of iterations of the LDPC decoder to
+ * @param[in]  max_its  The maximum number of iterations of the LDPC decoder to
  *                      run. The algorithm may terminate after fewer iterations
  *                      if the current candidate codeword passes all the parity
  *                      checks, or if it satisfies the CRC check.
@@ -3656,8 +3731,7 @@ armral_status armral_ldpc_decode_block_noalloc(const int8_t *llrs,
  */
 uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
                                                       uint32_t z,
-                                                      uint32_t crc_idx,
-                                                      uint32_t num_its);
+                                                      uint32_t max_its);
 
 /**
  * Matches the rate of the code block encoded with LDPC code to the rate of the
diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp
index 06331567aad3451bbf0597c3fb7d3b07e74c9384..29a7729103bbe6f4928f094a3a6f08d27e84a020 100644
--- a/simulation/ldpc_awgn/ldpc_awgn.cpp
+++ b/simulation/ldpc_awgn/ldpc_awgn.cpp
@@ -223,8 +223,9 @@ int run_check(armral::utils::random_state *state, uint32_t z,
                             data->data_recovered);
 
   // Run LDPC decoding for a single block
-  armral_ldpc_decode_block(data->data_recovered, bg, z, ARMRAL_LDPC_NO_CRC, 10,
-                           data->data_decoded);
+  armral_ldpc_decode_block(data->len_encoded, data->data_recovered, bg, z,
+                           data->len_filler_bits, data->data_decoded, 10,
+                           ARMRAL_LDPC_CRC_NO);
 
   // To make it easier to compare the values, convert the bit array to a byte
   // array
diff --git a/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp
index c00af85d01272c9cc9cc2e705885d2b596d949c4..a7e1d7ea1a58569bc41e541cafb6770037bde826 100644
--- a/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp
+++ b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp
@@ -1296,7 +1296,7 @@ bool hard_decision(int16_t *ptr_l, uint8_t *crc_buff, uint8_t *ptr_data,
                    uint32_t k, uint32_t crc_flag) {
 
   uint32_t num_lanes = get_num_lanes();
-  uint32_t k_prime = k + 24;
+  uint32_t k_prime = k;
   uint32_t full_vec = (k_prime) / num_lanes;
   uint32_t tail_cnt = (k_prime) & (num_lanes - 1);
   uint8_t *data = (uint8_t *)crc_buff;
@@ -1415,6 +1415,8 @@ bool hard_decision(int16_t *ptr_l, uint8_t *crc_buff, uint8_t *ptr_data,
       for (uint32_t i = 0; i < (k_prime >> 3); i++) {
         ptr_data[i] = crc_buff[i + pad_bytes];
       }
+    } else {
+      memcpy(ptr_data, crc_buff, (k + 7) >> 3);
     }
   } else {
     memcpy(ptr_data, crc_buff, (k + 7) >> 3);
@@ -1423,68 +1425,222 @@ bool hard_decision(int16_t *ptr_l, uint8_t *crc_buff, uint8_t *ptr_data,
   return (crc == 0U);
 }
 
-inline void load_ptr_l(int16_t *ptr_l, const int8_t *llrs_ptr,
-                       uint32_t len_in) {
+inline void load_ptr_l(int16_t *ptr_l, const int8_t *llrs_ptr, uint32_t len_in,
+                       uint32_t filler_bits_len, uint32_t k) {
+
+  if (filler_bits_len == 0) {
 #if ARMRAL_ARCH_SVE >= 2
-  svint8_t vec8;
-  svbool_t pg = svptrue_b8();
+    svint8_t vec8;
+    svbool_t pg = svptrue_b8();
 
-  uint32_t num_lanes = get_num_lanes();
-  uint32_t full_blk = len_in / (2 * num_lanes);
-  uint32_t tail_cnt = len_in % (2 * num_lanes);
+    uint32_t num_lanes = get_num_lanes();
+    uint32_t full_blk = len_in / (2 * num_lanes);
+    uint32_t tail_cnt = len_in % (2 * num_lanes);
 
-  for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
-    vec8 = svld1_s8(pg, llrs_ptr);
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      vec8 = svld1_s8(pg, llrs_ptr);
 
-    svint16_t t1 = svmovlb_s16(vec8);
-    svint16_t t2 = svmovlt_s16(vec8);
+      svint16_t t1 = svmovlb_s16(vec8);
+      svint16_t t2 = svmovlt_s16(vec8);
 
-    svint16_t result1 = svzip1_s16(t1, t2);
-    svint16_t result2 = svzip2_s16(t1, t2);
+      svint16_t result1 = svzip1_s16(t1, t2);
+      svint16_t result2 = svzip2_s16(t1, t2);
 
-    svst1_s16(pg, ptr_l, result1);
-    ptr_l += num_lanes;
-    svst1_s16(pg, ptr_l, result2);
-    ptr_l += num_lanes;
-    llrs_ptr += 2 * num_lanes;
-  }
+      svst1_s16(pg, ptr_l, result1);
+      ptr_l += num_lanes;
+      svst1_s16(pg, ptr_l, result2);
+      ptr_l += num_lanes;
+      llrs_ptr += 2 * num_lanes;
+    }
 
-  if (tail_cnt != 0U) {
-    for (uint32_t i = 0; i < tail_cnt; i++) {
-      ptr_l[i] = (int16_t)llrs_ptr[i];
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
     }
-  }
 
 #else
 
-  uint32_t full_blk = len_in / 16;
-  uint32_t tail_cnt = len_in % 16;
-
-  for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
-    int8x16_t vec = vld1q_s8(llrs_ptr);
-    int8x8_t vec_h = vget_high_s8(vec);
-    int16x8_t vec_h_16 = vmovl_s8(vec_h);
-    int8x8_t vec_l = vget_low_s8(vec);
-    int16x8_t vec_l_16 = vmovl_s8(vec_l);
-    vst1q_s16(ptr_l, vec_l_16);
-    ptr_l += 8;
-    vst1q_s16(ptr_l, vec_h_16);
-    llrs_ptr += 16;
-    ptr_l += 8;
-  }
+    uint32_t full_blk = len_in / 16;
+    uint32_t tail_cnt = len_in % 16;
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      int8x16_t vec = vld1q_s8(llrs_ptr);
+      int8x8_t vec_h = vget_high_s8(vec);
+      int16x8_t vec_h_16 = vmovl_s8(vec_h);
+      int8x8_t vec_l = vget_low_s8(vec);
+      int16x8_t vec_l_16 = vmovl_s8(vec_l);
+      vst1q_s16(ptr_l, vec_l_16);
+      ptr_l += 8;
+      vst1q_s16(ptr_l, vec_h_16);
+      llrs_ptr += 16;
+      ptr_l += 8;
+    }
 
-  if (tail_cnt != 0U) {
-    for (uint32_t i = 0; i < tail_cnt; i++) {
-      ptr_l[i] = (int16_t)llrs_ptr[i];
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
+    }
+
+#endif
+  } else {
+#if ARMRAL_ARCH_SVE >= 2
+    svint8_t vec8;
+    svbool_t pg = svptrue_b8();
+    uint32_t num_lanes = get_num_lanes();
+
+    // copy k - filler_bits
+    uint32_t full_blk = (k - filler_bits_len) / (2 * num_lanes);
+    uint32_t tail_cnt = (k - filler_bits_len) % (2 * num_lanes);
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      vec8 = svld1_s8(pg, llrs_ptr);
+
+      svint16_t t1 = svmovlb_s16(vec8);
+      svint16_t t2 = svmovlt_s16(vec8);
+
+      svint16_t result1 = svzip1_s16(t1, t2);
+      svint16_t result2 = svzip2_s16(t1, t2);
+
+      svst1_s16(pg, ptr_l, result1);
+      ptr_l += num_lanes;
+      svst1_s16(pg, ptr_l, result2);
+      ptr_l += num_lanes;
+      llrs_ptr += 2 * num_lanes;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
+      llrs_ptr += tail_cnt;
+      ptr_l += tail_cnt;
+    }
+
+    // set filler bits
+    full_blk = filler_bits_len / (2 * num_lanes);
+    tail_cnt = filler_bits_len % (2 * num_lanes);
+    svint16_t vec_filler_data_16 = svdup_n_s16(127);
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      svst1_s16(pg, ptr_l, vec_filler_data_16);
+      ptr_l += num_lanes;
+      svst1_s16(pg, ptr_l, vec_filler_data_16);
+      ptr_l += num_lanes;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = 127;
+      }
+      ptr_l += tail_cnt;
+    }
+
+    // copy parity bits
+    full_blk = (len_in - (k - filler_bits_len)) / (2 * num_lanes);
+    tail_cnt = (len_in - (k - filler_bits_len)) % (2 * num_lanes);
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      vec8 = svld1_s8(pg, llrs_ptr);
+
+      svint16_t t1 = svmovlb_s16(vec8);
+      svint16_t t2 = svmovlt_s16(vec8);
+
+      svint16_t result1 = svzip1_s16(t1, t2);
+      svint16_t result2 = svzip2_s16(t1, t2);
+
+      svst1_s16(pg, ptr_l, result1);
+      ptr_l += num_lanes;
+      svst1_s16(pg, ptr_l, result2);
+      ptr_l += num_lanes;
+      llrs_ptr += 2 * num_lanes;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
+    }
+
+#else
+
+    // copy k - filler_bits
+    uint32_t full_blk = (k - filler_bits_len) / 16;
+    uint32_t tail_cnt = (k - filler_bits_len) % 16;
+    // int16_t *ptr_l1 = ptr_l;
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      int8x16_t vec = vld1q_s8(llrs_ptr);
+      int8x8_t vec_h = vget_high_s8(vec);
+      int16x8_t vec_h_16 = vmovl_s8(vec_h);
+      int8x8_t vec_l = vget_low_s8(vec);
+      int16x8_t vec_l_16 = vmovl_s8(vec_l);
+      vst1q_s16(ptr_l, vec_l_16);
+      ptr_l += 8;
+      vst1q_s16(ptr_l, vec_h_16);
+      llrs_ptr += 16;
+      ptr_l += 8;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
+      llrs_ptr += tail_cnt;
+      ptr_l += tail_cnt;
+    }
+
+    // set filler bits
+    full_blk = (filler_bits_len) / 16;
+    tail_cnt = (filler_bits_len) % 16;
+    int16x8_t vec_filler_data_16 = {127, 127, 127, 127, 127, 127, 127, 127};
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      vst1q_s16(ptr_l, vec_filler_data_16);
+      ptr_l += 8;
+      vst1q_s16(ptr_l, vec_filler_data_16);
+      ptr_l += 8;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = 127;
+      }
+      ptr_l += tail_cnt;
+    }
+
+    // copy parity bits
+    full_blk = (len_in - (k - filler_bits_len)) / 16;
+    tail_cnt = (len_in - (k - filler_bits_len)) % 16;
+
+    for (uint32_t num_block = 0; num_block < full_blk; num_block++) {
+      int8x16_t vec = vld1q_s8(llrs_ptr);
+      int8x8_t vec_h = vget_high_s8(vec);
+      int16x8_t vec_h_16 = vmovl_s8(vec_h);
+      int8x8_t vec_l = vget_low_s8(vec);
+      int16x8_t vec_l_16 = vmovl_s8(vec_l);
+      vst1q_s16(ptr_l, vec_l_16);
+      ptr_l += 8;
+      vst1q_s16(ptr_l, vec_h_16);
+      llrs_ptr += 16;
+      ptr_l += 8;
+    }
+
+    if (tail_cnt != 0U) {
+      for (uint32_t i = 0; i < tail_cnt; i++) {
+        ptr_l[i] = (int16_t)llrs_ptr[i];
+      }
     }
-  }
 
 #endif
+  }
 }
 
 template<typename Allocator>
-bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
-                  uint32_t crc_idx, uint32_t num_its, uint8_t *data_out,
+bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg,
+                  uint32_t z, uint32_t len_filler_bits, uint8_t *data_out,
+                  uint32_t max_its, armral_ldpc_decode_options_t options,
                   Allocator &allocator) {
 
   bool crc_passed = false;
@@ -1525,6 +1681,11 @@ bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
   }
 
   uint32_t r_index = 0;
+  uint32_t no_of_rows = graph->nrows;
+
+  if ((n + len_filler_bits) < z * graph->ncodeword_bits) {
+    no_of_rows = (n / z) - graph->nmessage_bits;
+  }
 
   // initialization with channel LLRs. 16-bit buffer "l" will be used for
   // in-place calculations
@@ -1535,13 +1696,16 @@ bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
   memset(ptr_l, 0, sizeof(int16_t) * 2 * z);
   ptr_l = ptr_l + 2 * z;
 
-  load_ptr_l(ptr_l, llrs_ptr, graph->ncodeword_bits * z);
+  load_ptr_l(ptr_l, llrs_ptr, n,
+             ((options & (1 << 4)) == ARMRAL_LDPC_FILLER_BITS_IMPLICIT) *
+                 len_filler_bits,
+             (bg == 0) ? (z * 20) : (z * 8));
 
   uint32_t full_blk = z_len / num_lanes;
 
-  for (uint32_t it = 0; it < num_its; ++it) {
+  for (uint32_t it = 0; it < max_its; ++it) {
     r_index = 0;
-    for (uint32_t layer = 0; layer < graph->nrows; layer++) {
+    for (uint32_t layer = 0; layer < no_of_rows; layer++) {
 
       // reset the sign buffer
       memset(row_sign_array.get(), 0, sizeof(int16_t) * z);
@@ -1582,24 +1746,31 @@ bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
                      sign_scratch.get(), &r_index);
     }
 
-    // early exit if crc Passes
-    if (crc_idx) {
-      if (it < (num_its - 1)) {
-        crc_passed =
-            hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true);
-        if (crc_passed) {
-          return crc_passed;
-        }
+    // decision and CRC at the iteration
+    if ((options & (1 << 2)) == ARMRAL_LDPC_CRC_EVERY_ITER) {
+      crc_passed =
+          hard_decision(l.get(), crc_buff.get(), &data_out[0],
+                        graph->nmessage_bits * z - len_filler_bits, true);
+      if (crc_passed) {
+        break;
       }
     }
   }
 
-  if (crc_idx == ARMRAL_LDPC_NO_CRC) { // do only decisions
-    crc_passed = hard_decision(l.get(), crc_buff.get(), &data_out[0],
-                               graph->nmessage_bits * z, false);
-  } else {
+  // do decision and CRC
+  if ((options & (2 << 2)) == ARMRAL_LDPC_CRC_END_ITER) {
+    // ignore filler bits
+    crc_passed =
+        hard_decision(l.get(), crc_buff.get(), &data_out[0],
+                      graph->nmessage_bits * z - len_filler_bits, true);
+  }
+
+  // do only decisions
+  if ((options & 3) == ARMRAL_LDPC_CRC_NO) {
+    // ignore filler bits
     crc_passed =
-        hard_decision(l.get(), crc_buff.get(), &data_out[0], crc_idx, true);
+        hard_decision(l.get(), crc_buff.get(), &data_out[0],
+                      graph->nmessage_bits * z - len_filler_bits, false);
   }
   return crc_passed;
 }
@@ -1607,41 +1778,43 @@ bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
 } // namespace armral::ldpc
 
 template bool armral::ldpc::decode_block<heap_allocator>(
-    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
-    uint32_t num_its, uint8_t *data_out, heap_allocator &);
+    uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
+    uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its,
+    armral_ldpc_decode_options_t options, heap_allocator &);
 
 template bool armral::ldpc::decode_block<buffer_bump_allocator>(
-    const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z, uint32_t crc_idx,
-    uint32_t num_its, uint8_t *data_out, buffer_bump_allocator &);
+    uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
+    uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its,
+    armral_ldpc_decode_options_t options, buffer_bump_allocator &);
 
-armral_status armral_ldpc_decode_block(const int8_t *llrs,
+armral_status armral_ldpc_decode_block(uint32_t n, const int8_t *llrs,
                                        armral_ldpc_graph_t bg, uint32_t z,
-                                       uint32_t crc_idx, uint32_t num_its,
-                                       uint8_t *data_out) {
+                                       uint32_t len_filler_bits,
+                                       uint8_t *data_out, uint32_t max_its,
+                                       armral_ldpc_decode_options_t options) {
 
   heap_allocator allocator{};
-  bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its,
-                                           data_out, allocator);
+  bool result = armral::ldpc::decode_block(
+      n, llrs, bg, z, len_filler_bits, data_out, max_its, options, allocator);
   return (result) ? ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL;
 }
 
-armral_status
-armral_ldpc_decode_block_noalloc(const int8_t *llrs, armral_ldpc_graph_t bg,
-                                 uint32_t z, uint32_t crc_idx, uint32_t num_its,
-                                 uint8_t *data_out, void *buffer) {
+armral_status armral_ldpc_decode_block_noalloc(
+    uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
+    uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its,
+    armral_ldpc_decode_options_t options, void *buffer) {
 
   buffer_bump_allocator allocator{buffer};
-  bool result = armral::ldpc::decode_block(llrs, bg, z, crc_idx, num_its,
-                                           data_out, allocator);
+  bool result = armral::ldpc::decode_block(
+      n, llrs, bg, z, len_filler_bits, data_out, max_its, options, allocator);
   return (result) ? ARMRAL_SUCCESS : ARMRAL_RESULT_FAIL;
 }
 
 uint32_t armral_ldpc_decode_block_noalloc_buffer_size(armral_ldpc_graph_t bg,
                                                       uint32_t z,
-                                                      uint32_t crc_idx,
-                                                      uint32_t num_its) {
+                                                      uint32_t max_its) {
   counting_allocator allocator{};
-  armral::ldpc::decode_block(nullptr, bg, z, crc_idx, num_its, nullptr,
-                             allocator);
+  armral::ldpc::decode_block(0, nullptr, bg, z, 0, nullptr, max_its,
+                             ARMRAL_LDPC_CRC_NO, allocator);
   return allocator.required_bytes();
 }
diff --git a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp
index b13f508b0ce5f2692a462c3d3a9972d3be00812e..aba68bae92251fb62c60541356e874a787364a58 100644
--- a/src/UpperPHY/LDPC/ldpc_coding.hpp
+++ b/src/UpperPHY/LDPC/ldpc_coding.hpp
@@ -15,8 +15,9 @@ constexpr uint32_t num_lifting_sets = 8;
 uint32_t get_lifting_index(uint32_t lifting_size);
 
 template<typename Allocator>
-bool decode_block(const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
-                  uint32_t crc_idx, uint32_t num_its, uint8_t *data_out,
+bool decode_block(uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg,
+                  uint32_t z, uint32_t len_filler_bits, uint8_t *data_out,
+                  uint32_t max_its, armral_ldpc_decode_options_t options,
                   Allocator &allocator);
 
 } // namespace armral::ldpc
diff --git a/test/UpperPHY/LDPC/Decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp
index a722b4a2df2dcff725ede3de044aac73a75e2a2c..0c494a09811ec13f3c31b740b718b4018c968f2d 100644
--- a/test/UpperPHY/LDPC/Decoding/main.cpp
+++ b/test/UpperPHY/LDPC/Decoding/main.cpp
@@ -100,32 +100,57 @@ bool check_decoded_message(uint32_t len, const uint8_t *orig,
 
 template<typename LDPCDecodingFunction>
 bool run_ldpc_decoding_test(uint32_t its, uint32_t z, armral_ldpc_graph_t bg,
-                            uint32_t crc_idx,
+                            armral_ldpc_decode_options_t options,
+                            uint32_t len_filler_bits,
                             LDPCDecodingFunction ldpc_decoding_under_test) {
 
   bool passed = true;
   const auto *graph = armral_ldpc_get_base_graph(bg);
+  uint32_t n;
 
   // Allocate a random input to be encoded
-  uint32_t len_in = z * graph->nmessage_bits;
+  uint32_t len_in = z * graph->nmessage_bits; // z * 22 or z * 10
+  uint32_t buf_len_in = z * graph->nmessage_bits;
+
+  if (len_filler_bits) {
+    len_in -= len_filler_bits;
+  }
+
   armral::utils::int_random<uint8_t> random;
-  auto to_encode = random.vector((len_in + 7) / 8);
+  auto to_encode = random.vector((buf_len_in + 7) / 8);
 
   // If we are doing CRC checking, then we need to attach CRC bits to the input
-  if (crc_idx != ARMRAL_LDPC_NO_CRC) {
-    auto info_to_encode = random.vector((len_in + 7) / 8);
-    len_in = crc_idx + 24;
-    ldpc_crc_attachment(info_to_encode.data(), len_in, z * graph->nmessage_bits,
+  if ((options & 3) != ARMRAL_LDPC_CRC_NO) {
+    auto info_to_encode = random.vector((buf_len_in + 7) / 8);
+    ldpc_crc_attachment(info_to_encode.data(), len_in, len_in - 24,
                         to_encode.data());
+    if (len_filler_bits) {
+      memset(&to_encode.data()[(len_in + 7) >> 3], 0,
+             sizeof(uint8_t) * ((len_filler_bits + 7) >> 3));
+    }
   }
 
   uint32_t encoded_len = z * graph->ncodeword_bits;
   auto encoded = random.vector((encoded_len + 7) / 8);
-  uint32_t len_filler_bits = 0;
+
   // Encode the data
   armral_ldpc_encode_block(to_encode.data(), bg, z, len_filler_bits,
                            encoded.data());
 
+  // Simulate filler bits removal to create test data for
+  // 'ARMRAL_LDPC_FILLER_BITS_IMPLICIT'
+  if ((options & (1 << 4)) == ARMRAL_LDPC_FILLER_BITS_IMPLICIT) {
+    uint32_t len_s_f_bytes = (len_in - 2 * z + len_filler_bits) >> 3;
+    uint32_t len_s_bytes = (len_in - 2 * z) >> 3;
+    uint32_t len_p_bytes =
+        (encoded_len - (len_in - 2 * z + len_filler_bits)) >> 3;
+
+    uint8_t *buf_1 = encoded.data();
+    memcpy((void *)&buf_1[len_s_bytes], &buf_1[len_s_f_bytes], len_p_bytes);
+
+    encoded_len -= len_filler_bits;
+  }
+
   // run modulation
   armral_modulation_type mod_type = ARMRAL_MOD_16QAM;
   int mod_num_symbols = (encoded_len + 3) / 4;
@@ -142,10 +167,20 @@ bool run_ldpc_decoding_test(uint32_t its, uint32_t z, armral_ldpc_graph_t bg,
                       data_demod_soft.data());
 
   auto decoded = random.vector((len_in + 7) / 8);
-  if (ldpc_decoding_under_test(data_demod_soft.data(), bg, z, crc_idx, its,
-                               decoded.data()) != ARMRAL_SUCCESS) {
+
+  if ((z == 56) || (z == 144)) {
+    // e < n
+    n = (encoded_len / 2);
+  } else {
+    n = encoded_len;
+  }
+
+  if (ldpc_decoding_under_test(n, data_demod_soft.data(), bg, z,
+                               len_filler_bits, decoded.data(), its,
+                               options) != ARMRAL_SUCCESS) {
     return false;
   }
+
   auto decoded_bytes = armral::bits_to_bytes(len_in, decoded.data());
 
   // Also check that the decoded message is equal to the original message
@@ -164,19 +199,17 @@ bool run_all_tests(char const *name,
   std::array bgs{LDPC_BASE_GRAPH_1, LDPC_BASE_GRAPH_2};
   std::array num_its{1, 2, 5, 10};
   std::array zs{2, 6, 13, 20, 30, 56, 144, 208, 224, 256, 320, 384};
-  // Crc-index is zero based indexing
-  std::array crc_idx_1{4225, 4553, 5257, 6313, 7721};
-  std::array crc_idx_2{1921, 2057, 2377, 2857, 3497};
   for (auto bg : bgs) {
-    const auto &crc_ids = (bg == 0) ? crc_idx_1 : crc_idx_2;
     for (uint32_t i = 0; i < zs.size(); i++) {
       auto z = zs[i];
-      assert(z < 208 || i >= 7);
-      auto crc_idx = (z >= 208) ? (crc_ids[i - 7] - 1) : ARMRAL_LDPC_NO_CRC;
+      armral_ldpc_decode_options_t options = armral_ldpc_decode_options_t(
+          (z >= 208) ? ARMRAL_LDPC_DEFAULT_OPTIONS : 0);
+      uint32_t len_filler_bits = (z >= 208) ? 32 : 0;
       for (auto its : num_its) {
-        printf("[%s] z = %d, crc_idx = %u, its = %d\n", name, z, crc_idx, its);
-        auto check = run_ldpc_decoding_test(its, z, bg, crc_idx,
-                                            ldpc_decoding_under_test);
+        printf("[%s] z = %d, its = %d len_filler_bits = %d\n", name, z, its,
+               len_filler_bits);
+        auto check = run_ldpc_decoding_test(
+            its, z, bg, options, len_filler_bits, ldpc_decoding_under_test);
         if (!check) {
           // GCOVR_EXCL_START
           printf("Decoding failed\n");
@@ -196,13 +229,15 @@ int main(int argc, char **argv) {
 
   passed &= run_all_tests(
       "LDPCDecodingNoAlloc",
-      [](const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
-         uint32_t crc_idx, uint32_t num_its, uint8_t *data_out) {
-        auto buffer_size = armral_ldpc_decode_block_noalloc_buffer_size(
-            bg, z, crc_idx, num_its);
+      [](uint32_t n, const int8_t *llrs, armral_ldpc_graph_t bg, uint32_t z,
+         uint32_t len_filler_bits, uint8_t *data_out, uint32_t max_its,
+         armral_ldpc_decode_options_t options) {
+        auto buffer_size =
+            armral_ldpc_decode_block_noalloc_buffer_size(bg, z, max_its);
         std::vector<uint8_t> buffer(buffer_size);
-        return armral_ldpc_decode_block_noalloc(llrs, bg, z, crc_idx, num_its,
-                                                data_out, buffer.data());
+        return armral_ldpc_decode_block_noalloc(n, llrs, bg, z, len_filler_bits,
+                                                data_out, max_its, options,
+                                                buffer.data());
       });
 
   exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);