diff --git a/test/common/test_suite.hpp b/test/common/test_suite.hpp
index cc79103785148c5da7493334c21fe17d1a7c6c8f..1bc851774da8c695ae3574f71fe63d0054ce4c24 100644
--- a/test/common/test_suite.hpp
+++ b/test/common/test_suite.hpp
@@ -76,6 +76,22 @@ struct MatMulShape {
     size_t m{};  ///< LHS height.
     size_t n{};  ///< RHS width.
     size_t k{};  ///< LHS width and RHS height.
+private:
+    friend bool operator==(const MatMulShape& lhs, const MatMulShape& rhs) {
+        return                 //
+            lhs.m == rhs.m &&  //
+            lhs.n == rhs.n &&  //
+            lhs.k == rhs.k;
+    }
+};
+
+struct HashMatMulShape {
+    size_t operator()(const kai::test::MatMulShape& shape) const {
+        return                                     //
+            (std::hash<size_t>{}(shape.m) << 0) ^  //
+            (std::hash<size_t>{}(shape.n) << 1) ^  //
+            (std::hash<size_t>{}(shape.k) << 2);
+    }
 };
 
 /// Matrix multiplication test information.
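For orientation: the `operator==` / `HashMatMulShape` pair added above is exactly what a standard hash container needs to key on `MatMulShape`. A minimal, self-contained sketch of the pattern (illustrative only; `shape_cache` and the literal dimensions are hypothetical, not part of the patch):

```cpp
#include <cstddef>
#include <functional>
#include <unordered_map>

struct MatMulShape {
    std::size_t m{}, n{}, k{};

    friend bool operator==(const MatMulShape& lhs, const MatMulShape& rhs) {
        return lhs.m == rhs.m && lhs.n == rhs.n && lhs.k == rhs.k;
    }
};

struct HashMatMulShape {
    std::size_t operator()(const MatMulShape& shape) const {
        // XOR of shifted per-field hashes; the shifts generally keep
        // permutations of the same three values from hashing identically.
        return (std::hash<std::size_t>{}(shape.m) << 0) ^
               (std::hash<std::size_t>{}(shape.n) << 1) ^
               (std::hash<std::size_t>{}(shape.k) << 2);
    }
};

int main() {
    std::unordered_map<MatMulShape, int, HashMatMulShape> shape_cache;
    shape_cache[MatMulShape{32, 64, 128}] = 1;  // keyed lookup now works
    return shape_cache.count(MatMulShape{32, 64, 128}) == 1 ? 0 : 1;
}
```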
diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp
index f7888e291ae6cf1c83111c5e77a25bad76b2eee0..e735b23f553730a61f30e02d86399a3d3b5b5e02 100644
--- a/test/reference/matmul.cpp
+++ b/test/reference/matmul.cpp
@@ -189,7 +189,8 @@ template <
     typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData>
 std::vector<uint8_t> indirect_matmul_nt_t_quantized(
     size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length,  //
-    const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales,
+    const void* lhs_zero_points, size_t lhs_quant_height,
     size_t lhs_quant_width,  //
     const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
     size_t rhs_quant_width,  //
@@ -199,27 +200,32 @@ std::vector<uint8_t> indirect_matmul_nt_t_quantized(
 
     std::vector<uint8_t> dst(m * n * sizeof(DstData));
 
-    for (size_t y = 0; y < m; ++y) {
-        for (size_t x = 0; x < n; ++x) {
+    for (size_t i_m = 0; i_m < m; ++i_m) {
+        for (size_t i_n = 0; i_n < n; ++i_n) {
             DstData acc = 0;
 
             for (size_t i_k_chunk = 0; i_k_chunk < k_chunk_count; ++i_k_chunk) {
-                const void* lhs_data = *(lhs_ptrs + (y * k_chunk_count + i_k_chunk));
+                // Calculate the K chunk pointer. Apply the offset if this is not padding.
+                const size_t k_chunk_idx = i_m * k_chunk_count + i_k_chunk;
+                const void* k_chunk_ptr = lhs_ptrs[k_chunk_idx];
+                if (k_chunk_ptr != lhs_padding) {
+                    k_chunk_ptr = reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(k_chunk_ptr) + lhs_offset);
+                }
 
                 for (size_t i_k_chunk_len = 0; i_k_chunk_len < k_chunk_length; ++i_k_chunk_len) {
                     const size_t i = i_k_chunk * k_chunk_length + i_k_chunk_len;
                     const auto lhs_data_index = i_k_chunk_len;
-                    const auto lhs_quant_index = (y / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width;
-                    const auto lhs_value = read_array<LhsData>(lhs_data, lhs_data_index);
+                    const auto lhs_quant_index = (i_m / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width;
+                    const auto lhs_value = read_array<LhsData>(k_chunk_ptr, lhs_data_index);
                     const auto lhs_scale = lhs_scales != nullptr
                         ? read_array<LhsScale>(lhs_scales, lhs_quant_index)
                         : static_cast<LhsScale>(1);
                     const auto lhs_zero_point = lhs_zero_points != nullptr
                         ? read_array<LhsZeroPoint>(lhs_zero_points, lhs_quant_index)
                         : static_cast<LhsZeroPoint>(0);
-                    const auto rhs_data_index = x * (k_chunk_count * k_chunk_length) + i;
-                    const auto rhs_quant_index = (x / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width;
+                    const auto rhs_data_index = i_n * (k_chunk_count * k_chunk_length) + i;
+                    const auto rhs_quant_index = (i_n / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width;
                     const auto rhs_value = read_array<RhsData>(rhs_data, rhs_data_index);
                     const auto rhs_scale = rhs_scales != nullptr
                         ? read_array<RhsScale>(rhs_scales, rhs_quant_index)
                         : static_cast<RhsScale>(1);
@@ -235,19 +241,19 @@ std::vector<uint8_t> indirect_matmul_nt_t_quantized(
             }
 
             if (bias_data != nullptr) {
-                const auto bias_value = read_array<BiasData>(bias_data, x);
+                const auto bias_value = read_array<BiasData>(bias_data, i_n);
                 const auto bias_scale = bias_scales != nullptr
-                    ? read_array<BiasScale>(bias_scales, x / bias_quant_width)
+                    ? read_array<BiasScale>(bias_scales, i_n / bias_quant_width)
                     : static_cast<BiasScale>(1);
                 const auto bias_zero_point = bias_zero_points != nullptr
-                    ? read_array<BiasZeroPoint>(bias_zero_points, x / bias_quant_width)
+                    ? read_array<BiasZeroPoint>(bias_zero_points, i_n / bias_quant_width)
                     : static_cast<BiasZeroPoint>(0);
 
                 acc += (static_cast<DstData>(bias_value) - static_cast<DstData>(bias_zero_point)) *
                     static_cast<DstData>(bias_scale);
             }
 
-            write_array<DstData>(dst.data(), y * n + x, acc);
+            write_array<DstData>(dst.data(), i_m * n + i_n, acc);
         }
     }
@@ -331,7 +337,8 @@ matmul_nt_t_quantized
 indirect_matmul_nt_t_quantized(
     size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length,  //
-    const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales,
+    const void* lhs_zero_points, size_t lhs_quant_height,
     size_t lhs_quant_width,  //
     const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
     size_t rhs_quant_width,  //
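A note on the convention introduced here: an entry of `lhs_ptrs` is either the shared padding chunk, a real pointer used verbatim, or a buffer offset disguised as a pointer, which only becomes dereferenceable once `lhs_offset` (the base address) is added. A self-contained sketch of that resolution rule (`resolve_chunk` is a hypothetical helper, not part of the patch):

```cpp
#include <cstdint>

// Padding entries are returned as-is; every other entry is rebased by
// lhs_offset, mirroring the loop body in the hunk above.
inline const void* resolve_chunk(const void* k_chunk_ptr, std::uintptr_t lhs_offset, const void* lhs_padding) {
    if (k_chunk_ptr == lhs_padding) {
        return k_chunk_ptr;  // shared zero chunk: already a valid pointer
    }
    return reinterpret_cast<const void*>(reinterpret_cast<std::uintptr_t>(k_chunk_ptr) + lhs_offset);
}
```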
diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp
index 913de1f84ba8d1196afae424f3873135903ca6e5..8d83e98c2455cb8e7c972d097586e1eefd55b5aa 100644
--- a/test/reference/matmul.hpp
+++ b/test/reference/matmul.hpp
@@ -122,7 +122,12 @@ std::vector<uint8_t> matmul_clamp_nt_t(
 /// @param[in] m The LHS and output height.
 /// @param[in] n The RHS height and output width.
 /// @param[in] k The LHS and RHS width.
+/// @param[in] k_chunk_count Number of K chunk pointers per row in the lhs_idata matrix.
+/// @param[in] k_chunk_length Length of each K chunk pointed to in the lhs_idata matrix.
 /// @param[in] lhs_data The LHS data matrix.
+/// @param[in] lhs_idata The indirect LHS data matrix.
+/// @param[in] lhs_offset The indirection LHS data matrix offset, applied to non-padding pointers.
+/// @param[in] lhs_padding The indirection LHS padding chunk pointer.
 /// @param[in] lhs_scales The LHS quantization scales matrix.
 /// @param[in] lhs_zero_points The LHS quantization zero points matrix.
 /// @param[in] lhs_quant_width The LHS quantization block width.
@@ -161,7 +166,8 @@ template <
     typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData>
 std::vector<uint8_t> indirect_matmul_nt_t_quantized(
     size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length,  //
-    const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales,
+    const void* lhs_zero_points, size_t lhs_quant_height,
     size_t lhs_quant_width,  //
     const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
     size_t rhs_quant_width,  //
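The new parameters describe a producer/consumer split: the caller stores offsets in `lhs_idata` and hands the buffer's base address over separately as `lhs_offset`, so the same indirection table stays valid wherever the LHS buffer ends up in memory. A sketch of the producer side under those assumptions (`build_indirection` and `row_is_padded` are hypothetical; element offsets equal byte offsets here because the data is 8-bit):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using IndirectionBuffer = std::vector<const void*>;

// Hypothetical builder: padded rows get the shared padding chunk, all other
// entries encode an offset into the LHS buffer as a pointer-sized integer.
IndirectionBuffer build_indirection(
    std::size_t m, std::size_t k, std::size_t k_chunk_count, std::size_t k_chunk_len, const void* padding_chunk,
    const std::vector<bool>& row_is_padded) {
    IndirectionBuffer ptrs(m * k_chunk_count);
    for (std::size_t row = 0; row < m; ++row) {
        for (std::size_t chunk = 0; chunk < k_chunk_count; ++chunk) {
            const std::size_t idx = row * k_chunk_count + chunk;
            if (row_is_padded[row]) {
                ptrs[idx] = padding_chunk;  // used verbatim by the consumer
            } else {
                const std::uintptr_t offset = row * k + chunk * k_chunk_len;
                ptrs[idx] = reinterpret_cast<const void*>(offset);  // rebased later
            }
        }
    }
    return ptrs;
}
```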
diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp
index d71193a913a79164157516ddd9c0267abfb7a4f5..e7ded92ce75bb01f50de2e1abf6fd2940b7f722b 100644
--- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp
+++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <unordered_map>
 #include
 
 #include "kai/kai_common.h"
@@ -336,6 +337,8 @@ struct TestReference {
     Buffer lhs_qai8_zero_points;
     IndirectionBuffer lhs_qai8_indirect;
     Buffer lhs_qai8_indirect_packed;
+    Buffer lhs_qai8_indirect_padding;
+    size_t lhs_qai8_indirect_offset;
 
     Buffer rhs_qsi8;
     Buffer rhs_scales;
@@ -380,30 +383,59 @@ static const kai_imatmul_clamp_qai8_qai8p_qsi8cxp_ukernel
     .run_matmul = kai_run_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa,
 };
 
-// M, N, K, k_chunk_length, pack.m, pack.n, pack.k
-using TestDataId = std::tuple<size_t, size_t, size_t, size_t, size_t, size_t, size_t>;
+static constexpr int8_t padding_value = 0;
+
+// Functionality for hashing generated test data.
+// This is particularly useful for portion testing,
+// which reuses the exact same data for all portions.
+struct TestDataId {
+    MatMulShape shape;
+    MatMulShape shape_pack;
+    size_t chunk_len;
+    bool pad_testing;
+
+private:
+    friend bool operator==(const TestDataId& lhs, const TestDataId& rhs) {
+        return                                   //
+            lhs.shape == rhs.shape &&            //
+            lhs.shape_pack == rhs.shape_pack &&  //
+            lhs.chunk_len == rhs.chunk_len &&    //
+            lhs.pad_testing == rhs.pad_testing;
+    }
+};
+
+struct HashTestDataId {
+    size_t operator()(const TestDataId& id) const {
+        return                                          //
+            (HashMatMulShape{}(id.shape) << 0) ^        //
+            (HashMatMulShape{}(id.shape_pack) << 1) ^   //
+            (std::hash<size_t>{}(id.chunk_len) << 2) ^  //
+            (std::hash<bool>{}(id.pad_testing) << 3);
+    }
+};
+
 // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
-static std::map<TestDataId, TestReference> g_data;
+static std::unordered_map<TestDataId, TestReference, HashTestDataId> g_data;
 // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
 
 /// Generate test reference data
-static const TestReference& get_test_reference(
-    const MatMulShape& shape, const MatMulShape& pack_shape, size_t k_chunk_len) {
+static const TestReference& get_test_reference(const TestDataId& test_data_id) {
     // ============================================================
     // Generates input and reference output data
     // ============================================================
 
     // Attempt to find test data in cache
-    const TestDataId data_id{shape.m, shape.n, shape.k, k_chunk_len, pack_shape.m, pack_shape.n, pack_shape.k};
-    const auto data_it = g_data.find(data_id);
+    const auto data_it = g_data.find(test_data_id);
     if (data_it != g_data.end()) {
         return data_it->second;
     }
 
+    const auto& [shape, pack_shape, k_chunk_len, pad_testing] = test_data_id;
+
     // Generates the input data in floating-point.
-    const auto lhs_f32 = fill_random<float>(shape.m * shape.k, seed);
-    const auto rhs_f32 = fill_random<float>(shape.k * shape.n, seed);
-    const auto bias_f32 = fill_random<float>(shape.n, seed);
+    Buffer lhs_f32 = fill_random<float>(shape.m * shape.k, seed);
+    const Buffer rhs_f32 = fill_random<float>(shape.k * shape.n, seed);
+    const Buffer bias_f32 = fill_random<float>(shape.n, seed);
 
     // Quantizes the input data.
     // * LHS: 8-bit asymmetric per-matrix quantization.
@@ -417,18 +449,26 @@ static const TestReference& get_test_reference(
     const auto lhs_scale = read_array<float>(lhs_qai8_scales.data(), 0);
     const auto lhs_zero_point = read_array<int32_t>(lhs_qai8_zero_points.data(), 0);
 
-    IndirectionBuffer lhs_qai8_indirect;
-
     const size_t k_chunk_count = shape.k / k_chunk_len;
     assert(k_chunk_count * k_chunk_len == shape.k);
 
     // Setup an indirection buffer, where each "row" contains `k_chunk_count`
     // pointers to chunks of length `k_chunk_len` in the input_buffer
+    IndirectionBuffer lhs_qai8_indirect(shape.m * k_chunk_count);
+    Buffer lhs_padding(k_chunk_len, padding_value);
     for (size_t m_i = 0; m_i < shape.m; ++m_i) {
         for (size_t k_chunk_idx = 0; k_chunk_idx < k_chunk_count; ++k_chunk_idx) {
-            lhs_qai8_indirect.push_back(&lhs_qai8.at(m_i * shape.k + k_chunk_idx * k_chunk_len));
+            const size_t idx = m_i * k_chunk_count + k_chunk_idx;
+            if (pad_testing && m_i == 0) {
+                // Push padding pointers for the first row
+                lhs_qai8_indirect[idx] = lhs_padding.data();
+            } else {
+                uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len;
+                lhs_qai8_indirect[idx] = reinterpret_cast<const void*>(offset);
+            }
         }
     }
+    const auto indirection_base = reinterpret_cast<uintptr_t>(lhs_qai8.data());
 
     // Reorder indirection pointers to layout the packing kernel expectes
     Buffer lhs_qai8_indirect_packed = reorder_block(
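`get_test_reference()` above is a get-or-compute cache: expensive reference data is generated once per `TestDataId` and then reused by every output portion of the same test case. The pattern in isolation (a sketch; `Reference` and `generate` are hypothetical stand-ins for the test's types):

```cpp
#include <unordered_map>

struct Reference {
    int payload;
};

// Hypothetical stand-in for the expensive reference-data generation.
Reference generate(int id) {
    return Reference{id * 2};
}

const Reference& get_or_compute(int id) {
    static std::unordered_map<int, Reference> cache;
    if (const auto it = cache.find(id); it != cache.end()) {
        return it->second;  // hit: every portion of a test case lands here
    }
    return cache[id] = generate(id);  // miss: generate once, cache for reuse
}
```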
@@ -449,11 +489,12 @@ static const TestReference& get_test_reference(
         quantize_symmetric_per_block(bias_f32.data(), bias_scales.data(), shape.n, 1, 1);
 
     // Runs the reference implementation of matmul to produce floating-point result.
+    const void* const* lhs_iptr = reinterpret_cast<const void* const*>(lhs_qai8_indirect.data());
     const auto ref_dst_f32 = indirect_matmul_nt_t_quantized(
-        shape.m, shape.n, k_chunk_count, k_chunk_len,  // matmul shape
-        reinterpret_cast<const void* const*>(lhs_qai8_indirect.data()), &lhs_scale,
-        &lhs_zero_point,  // LHS, scaling factor and zero point
+        shape.m, shape.n, k_chunk_count, k_chunk_len,    // matmul shape
+        lhs_iptr, indirection_base, lhs_padding.data(),  // LHS indirection, offset and padding
+        &lhs_scale, &lhs_zero_point,                     // LHS, scaling factor and zero point
         shape.m, shape.k,                                // LHS quantization window shape
         rhs_qsi8_t.data(), rhs_scales.data(), nullptr,   // RHS scaling factors
         1, shape.k,                                      // RHS quantization window shape
@@ -505,7 +546,7 @@ static const TestReference& get_test_reference(
         rhs_qsi8_t.data(), rhs_scales.data(), lhs_scale, dst_scale, bias_qsi32.data(), lhs_zero_point, shape.n,
         shape.k, pack_shape.n, pack_shape.k);
 
-    const TestReference& reference = g_data[data_id] = {
+    const TestReference& reference = g_data[test_data_id] = {
         .clamp = {.min = dst_qai8_clamp_min, .max = dst_qai8_clamp_max},
 
         .qa_lhs = {.scale = lhs_scale, .zero_point = lhs_zero_point},
@@ -516,6 +557,8 @@ static const TestReference& get_test_reference(
         .lhs_qai8_zero_points = std::move(lhs_qai8_zero_points),
         .lhs_qai8_indirect = std::move(lhs_qai8_indirect),
         .lhs_qai8_indirect_packed = std::move(lhs_qai8_indirect_packed),
+        .lhs_qai8_indirect_padding = std::move(lhs_padding),
+        .lhs_qai8_indirect_offset = indirection_base,
 
         .rhs_qsi8 = std::move(rhs_qsi8),
         .rhs_scales = std::move(rhs_scales),
@@ -715,7 +758,8 @@ TEST_P(MatMulQuantizedTest, EndToEnd) {
         GTEST_SKIP() << "CPU features are not supported by current CPU";
     }
 
-    TestReference reference = get_test_reference(shape, variant.acc_pack, 1);
+    TestDataId test_data_id{shape, variant.acc_pack, shape.k, false};
+    const TestReference& reference = get_test_reference(test_data_id);
 
     // Check scheduling parameters
     const auto imp_mr = variant.matmul.get_mr();
@@ -762,13 +806,11 @@ static Buffer lhs_pack(
     const size_t input_offset = portion.start_row() * k_chunk.count;
     const size_t dst_offset = variant.get_packed_lhs_offset(portion.start_row(), k_chunk.count, k_chunk.length);
 
-    // TODO: `lhs_offset` is currently not being excercized!
-    // TODO: Ensure that `zero` pointers are tested
     variant.pack(
         portion.height(), k_chunk.count, k_chunk.length,  // Dimensions
         indirection_pointer + input_offset,               // Indirection input
-        0,                                                // chunk offset
-        nullptr,                                          // padding pointer
+        reference.lhs_qai8_indirect_offset,               // chunk offset
+        reference.lhs_qai8_indirect_padding.data(),       // padding pointer
         packed.data() + dst_offset);
 
     return packed;
@@ -854,7 +896,9 @@ TEST_P(IndirectMatMulQuantizedTest, EndToEnd) {
         GTEST_SKIP() << "CPU features are not supported by current CPU";
     }
 
-    const TestReference& reference = get_test_reference(shape, variant.acc_pack, k_chunk.length);
+    // Toggle padding tests when LHS has more than one row
+    TestDataId test_data_id{shape, variant.acc_pack, k_chunk.length, shape.m > 1};
+    const TestReference& reference = get_test_reference(test_data_id);
 
     const Rect portion = output_portion.compute_portion(shape.m, shape.n, variant.acc_step.m, variant.acc_step.n);
 
     Buffer packed_lhs = imatmul::lhs_pack(variant.lhs_pack, portion, reference, shape.m, k_chunk);
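One detail worth calling out from the last hunk: padding is exercised by replacing every chunk pointer of row 0 with the shared zero chunk, so a single-row LHS would be left with no real data to multiply. A compile-time sketch of that reasoning (the helper name is hypothetical, not part of the patch):

```cpp
#include <cstddef>

// Padding testing swaps row 0's chunk pointers for the shared zero chunk,
// so it only makes sense when another row still carries real data.
constexpr bool should_test_padding(std::size_t m) {
    return m > 1;
}

static_assert(!should_test_padding(1));  // one row: padding would erase all real data
static_assert(should_test_padding(4));   // rows 1..3 keep real data while row 0 is padded
```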