From 830a15911c17b19d8b9054a2a2d42df3758599fa Mon Sep 17 00:00:00 2001 From: Emil Ohlsson Date: Thu, 3 Apr 2025 13:12:34 +0200 Subject: [PATCH 1/2] Add IGEMM padding pointer testing For LHS shapes which has more than one row, set the first row of data to be padding. As this further increases the number of different test inputs this change also extends the caching mechanism to use an unordered map to store the generated test data, and uses a single object which encompasses all parameters used to generate test data Signed-off-by: Emil Ohlsson --- test/common/test_suite.hpp | 16 ++++ test/reference/matmul.cpp | 33 ++++--- test/reference/matmul.hpp | 8 +- .../matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 87 ++++++++++++++----- 4 files changed, 107 insertions(+), 37 deletions(-) diff --git a/test/common/test_suite.hpp b/test/common/test_suite.hpp index cc791037..1bc85177 100644 --- a/test/common/test_suite.hpp +++ b/test/common/test_suite.hpp @@ -76,6 +76,22 @@ struct MatMulShape { size_t m{}; ///< LHS height. size_t n{}; ///< RHS width. size_t k{}; ///< LHS width and RHS height. +private: + friend bool operator==(const MatMulShape& lhs, const MatMulShape& rhs) { + return // + lhs.m == rhs.m && // + lhs.n == rhs.n && // + lhs.k == rhs.k; + } +}; + +struct HashMatMulShape { + size_t operator()(const kai::test::MatMulShape& shape) const { + return // + (std::hash{}(shape.m) << 0) ^ // + (std::hash{}(shape.n) << 1) ^ // + (std::hash{}(shape.k) << 2); + } }; /// Matrix multiplication test information. 
diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index f7888e29..e735b23f 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -189,7 +189,8 @@ template < typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> std::vector indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // - const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales, + const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, size_t rhs_quant_width, // @@ -199,27 +200,32 @@ std::vector indirect_matmul_nt_t_quantized( std::vector dst(m * n * sizeof(DstData)); - for (size_t y = 0; y < m; ++y) { - for (size_t x = 0; x < n; ++x) { + for (size_t i_m = 0; i_m < m; ++i_m) { + for (size_t i_n = 0; i_n < n; ++i_n) { DstData acc = 0; for (size_t i_k_chunk = 0; i_k_chunk < k_chunk_count; ++i_k_chunk) { - const void* lhs_data = *(lhs_ptrs + (y * k_chunk_count + i_k_chunk)); + // Calculate the K chunk pointer. 
Apply offset if this is not padding + const size_t k_chunk_idx = i_m * k_chunk_count + i_k_chunk; + const void* k_chunk_ptr = lhs_ptrs[k_chunk_idx]; + if (k_chunk_ptr != lhs_padding) { + k_chunk_ptr = reinterpret_cast(reinterpret_cast(k_chunk_ptr) + lhs_offset); + } for (size_t i_k_chunk_len = 0; i_k_chunk_len < k_chunk_length; ++i_k_chunk_len) { const size_t i = i_k_chunk * k_chunk_length + i_k_chunk_len; const auto lhs_data_index = i_k_chunk_len; - const auto lhs_quant_index = (y / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width; - const auto lhs_value = read_array(lhs_data, lhs_data_index); + const auto lhs_quant_index = (i_m / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width; + const auto lhs_value = read_array(k_chunk_ptr, lhs_data_index); const auto lhs_scale = lhs_scales != nullptr ? read_array(lhs_scales, lhs_quant_index) : static_cast(1); const auto lhs_zero_point = lhs_zero_points != nullptr ? read_array(lhs_zero_points, lhs_quant_index) : static_cast(0); - const auto rhs_data_index = x * (k_chunk_count * k_chunk_length) + i; - const auto rhs_quant_index = (x / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width; + const auto rhs_data_index = i_n * (k_chunk_count * k_chunk_length) + i; + const auto rhs_quant_index = (i_n / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width; const auto rhs_value = read_array(rhs_data, rhs_data_index); const auto rhs_scale = rhs_scales != nullptr ? read_array(rhs_scales, rhs_quant_index) : static_cast(1); @@ -235,19 +241,19 @@ std::vector indirect_matmul_nt_t_quantized( } if (bias_data != nullptr) { - const auto bias_value = read_array(bias_data, x); + const auto bias_value = read_array(bias_data, i_n); const auto bias_scale = bias_scales != nullptr - ? read_array(bias_scales, x / bias_quant_width) + ? read_array(bias_scales, i_n / bias_quant_width) : static_cast(1); const auto bias_zero_point = bias_zero_points != nullptr - ? 
read_array(bias_zero_points, x / bias_quant_width) + ? read_array(bias_zero_points, i_n / bias_quant_width) : static_cast(0); acc += (static_cast(bias_value) - static_cast(bias_zero_point)) * static_cast(bias_scale); } - write_array(dst.data(), y * n + x, acc); + write_array(dst.data(), i_m * n + i_n, acc); } } @@ -331,7 +337,8 @@ matmul_nt_t_quantized indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // - const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales, + const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, size_t rhs_quant_width, // diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index 913de1f8..8d83e98c 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -122,7 +122,12 @@ std::vector matmul_clamp_nt_t( /// @param[in] m The LHS and output height. /// @param[in] n The RHS height and output width. /// @param[in] k The LHS and RHS width. +/// @param[in] k_chunk_count Number of K chunk pointers per row in lhs_idata matrix +/// @param[in] k_chunk_length Length of each K chunk pointed to in lhs_idata matrix /// @param[in] lhs_data The LHS data matrix. +/// @param[in] lhs_idata The indirect LHS data matrix. +/// @param[in] lhs_offset The indirection LHS data matrix offset, applied to non-padding pointers +/// @param[in] lhs_padding The indirection LHS padding chunk pointer /// @param[in] lhs_scales The LHS quantization scales matrix. /// @param[in] lhs_zero_points The LHS quantization zero points matrix. /// @param[in] lhs_quant_width The LHS quantization block width. 
@@ -161,7 +166,8 @@ template < typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> std::vector indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // - const void* const* lhs_ptrs, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales, + const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, size_t rhs_quant_width, // diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index d71193a9..07efff3a 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -336,6 +336,8 @@ struct TestReference { Buffer lhs_qai8_zero_points; IndirectionBuffer lhs_qai8_indirect; Buffer lhs_qai8_indirect_packed; + Buffer lhs_qai8_indirect_padding; + size_t lhs_qai8_indirect_offset; Buffer rhs_qsi8; Buffer rhs_scales; @@ -380,30 +382,59 @@ static const kai_imatmul_clamp_qai8_qai8p_qsi8cxp_ukernel .run_matmul = kai_run_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa, }; -// M, N, K, k_chunk_length, pack.m, pack.n, pack.k -using TestDataId = std::tuple; +static constexpr int8_t padding_value = 0; + +// Functionality for hashing generated test data. 
+// This is particularly useful for portion testing +// which reuses the exact same data for all portions +struct TestDataId { + MatMulShape shape; + MatMulShape shape_pack; + size_t chunk_len; + bool pad_testing; + +private: + friend bool operator==(const TestDataId& lhs, const TestDataId& rhs) { + return // + lhs.shape == rhs.shape && // + lhs.shape_pack == rhs.shape_pack && // + lhs.chunk_len == rhs.chunk_len && // + lhs.pad_testing == rhs.pad_testing; + } +}; + +struct HashTestDataId { + size_t operator()(const TestDataId& id) const { + return // + (HashMatMulShape{}(id.shape) << 0) ^ // + (HashMatMulShape{}(id.shape_pack) << 1) ^ // + (std::hash{}(id.chunk_len) << 2) ^ // + (std::hash{}(id.pad_testing) << 3); + } +}; + // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) -static std::map g_data; +static std::unordered_map g_data; // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) /// Generate test reference data -static const TestReference& get_test_reference( - const MatMulShape& shape, const MatMulShape& pack_shape, size_t k_chunk_len) { +static const TestReference& get_test_reference(const TestDataId& test_data_id) { // ============================================================ // Generates input and reference output data // ============================================================ // Attempt to find test data in cache - const TestDataId data_id{shape.m, shape.n, shape.k, k_chunk_len, pack_shape.m, pack_shape.n, pack_shape.k}; - const auto data_it = g_data.find(data_id); + const auto data_it = g_data.find(test_data_id); if (data_it != g_data.end()) { return data_it->second; } + const auto& [shape, pack_shape, k_chunk_len, pad_testing] = test_data_id; + // Generates the input data in floating-point. 
- const auto lhs_f32 = fill_random(shape.m * shape.k, seed); - const auto rhs_f32 = fill_random(shape.k * shape.n, seed); - const auto bias_f32 = fill_random(shape.n, seed); + Buffer lhs_f32 = fill_random(shape.m * shape.k, seed); + const Buffer rhs_f32 = fill_random(shape.k * shape.n, seed); + const Buffer bias_f32 = fill_random(shape.n, seed); // Quantizes the input data. // * LHS: 8-bit asymmetric per-matrix quantization. @@ -417,18 +448,26 @@ static const TestReference& get_test_reference( const auto lhs_scale = read_array(lhs_qai8_scales.data(), 0); const auto lhs_zero_point = read_array(lhs_qai8_zero_points.data(), 0); - IndirectionBuffer lhs_qai8_indirect; - const size_t k_chunk_count = shape.k / k_chunk_len; assert(k_chunk_count * k_chunk_len == shape.k); // Setup an indirection buffer, where each "row" contains `k_chunk_count` // pointers to chunks of length `k_chunk_len` in the input_buffer + IndirectionBuffer lhs_qai8_indirect(shape.m * k_chunk_count); + Buffer lhs_padding(k_chunk_len, padding_value); for (size_t m_i = 0; m_i < shape.m; ++m_i) { for (size_t k_chunk_idx = 0; k_chunk_idx < k_chunk_count; ++k_chunk_idx) { - lhs_qai8_indirect.push_back(&lhs_qai8.at(m_i * shape.k + k_chunk_idx * k_chunk_len)); + const size_t idx = m_i * k_chunk_count + k_chunk_idx; + if (pad_testing and m_i == 0) { + // Push padding pointers for first row + lhs_qai8_indirect[idx] = lhs_padding.data(); + } else { + uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len; + lhs_qai8_indirect[idx] = reinterpret_cast(offset); + } } } + const auto indirection_base = reinterpret_cast(lhs_qai8.data()); // Reorder indirection pointers to layout the packing kernel expectes Buffer lhs_qai8_indirect_packed = reorder_block( @@ -449,11 +488,12 @@ static const TestReference& get_test_reference( quantize_symmetric_per_block(bias_f32.data(), bias_scales.data(), shape.n, 1, 1); // Runs the reference implementation of matmul to produce floating-point result. 
+ const void* const* lhs_iptr = reinterpret_cast(lhs_qai8_indirect.data()); const auto ref_dst_f32 = indirect_matmul_nt_t_quantized( - shape.m, shape.n, k_chunk_count, k_chunk_len, // matmul shape - reinterpret_cast(lhs_qai8_indirect.data()), &lhs_scale, - &lhs_zero_point, // LHS, scaling factor and zero point + shape.m, shape.n, k_chunk_count, k_chunk_len, // matmul shape + lhs_iptr, indirection_base, lhs_padding.data(), // LHS indirection, offset and padding + &lhs_scale, &lhs_zero_point, // LHS, scaling factor and zero point shape.m, shape.k, // LHS quantization window shape rhs_qsi8_t.data(), rhs_scales.data(), nullptr, // RHS scaling factors 1, shape.k, // RHS quantization window shape @@ -505,7 +545,7 @@ static const TestReference& get_test_reference( rhs_qsi8_t.data(), rhs_scales.data(), lhs_scale, dst_scale, bias_qsi32.data(), lhs_zero_point, shape.n, shape.k, pack_shape.n, pack_shape.k); - const TestReference& reference = g_data[data_id] = { + const TestReference& reference = g_data[test_data_id] = { .clamp = {.min = dst_qai8_clamp_min, .max = dst_qai8_clamp_max}, .qa_lhs = {.scale = lhs_scale, .zero_point = lhs_zero_point}, @@ -516,6 +556,8 @@ static const TestReference& get_test_reference( .lhs_qai8_zero_points = std::move(lhs_qai8_zero_points), .lhs_qai8_indirect = std::move(lhs_qai8_indirect), .lhs_qai8_indirect_packed = std::move(lhs_qai8_indirect_packed), + .lhs_qai8_indirect_padding = std::move(lhs_padding), + .lhs_qai8_indirect_offset = indirection_base, .rhs_qsi8 = std::move(rhs_qsi8), .rhs_scales = std::move(rhs_scales), @@ -715,7 +757,7 @@ TEST_P(MatMulQuantizedTest, EndToEnd) { GTEST_SKIP() << "CPU features are not supported by current CPU"; } - TestReference reference = get_test_reference(shape, variant.acc_pack, 1); + TestReference reference = get_test_reference({shape, variant.acc_pack, shape.k, false}); // Check scheduling parameters const auto imp_mr = variant.matmul.get_mr(); @@ -762,13 +804,11 @@ static Buffer lhs_pack( const size_t 
input_offset = portion.start_row() * k_chunk.count; const size_t dst_offset = variant.get_packed_lhs_offset(portion.start_row(), k_chunk.count, k_chunk.length); - // TODO: `lhs_offset` is currently not being excercized! - // TODO: Ensure that `zero` pointers are tested variant.pack( portion.height(), k_chunk.count, k_chunk.length, // Dimensions indirection_pointer + input_offset, // Indirection input - 0, // chunk offset - nullptr, // padding pointer + reference.lhs_qai8_indirect_offset, // chunk offset + reference.lhs_qai8_indirect_padding.data(), // padding pointer packed.data() + dst_offset); return packed; @@ -854,7 +894,8 @@ TEST_P(IndirectMatMulQuantizedTest, EndToEnd) { GTEST_SKIP() << "CPU features are not supported by current CPU"; } - const TestReference& reference = get_test_reference(shape, variant.acc_pack, k_chunk.length); + // Toggle padding testst when LHS has more than one row + const TestReference& reference = get_test_reference({shape, variant.acc_pack, k_chunk.length, shape.m > 1}); const Rect portion = output_portion.compute_portion(shape.m, shape.n, variant.acc_step.m, variant.acc_step.n); Buffer packed_lhs = imatmul::lhs_pack(variant.lhs_pack, portion, reference, shape.m, k_chunk); -- GitLab From ca590a2f688e4691f731958f3a164f03c33bf355 Mon Sep 17 00:00:00 2001 From: Emil Ohlsson Date: Fri, 4 Apr 2025 12:26:28 +0200 Subject: [PATCH 2/2] Address compiler warnings Signed-off-by: Emil Ohlsson --- test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index 07efff3a..e7ded92c 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "kai/kai_common.h" @@ -757,7 +758,8 @@ TEST_P(MatMulQuantizedTest, EndToEnd) { GTEST_SKIP() << "CPU 
features are not supported by current CPU"; } - TestReference reference = get_test_reference({shape, variant.acc_pack, shape.k, false}); + TestDataId test_data_id{shape, variant.acc_pack, shape.k, false}; + const TestReference& reference = get_test_reference(test_data_id); // Check scheduling parameters const auto imp_mr = variant.matmul.get_mr(); @@ -895,7 +897,8 @@ TEST_P(IndirectMatMulQuantizedTest, EndToEnd) { } // Toggle padding testst when LHS has more than one row - const TestReference& reference = get_test_reference({shape, variant.acc_pack, k_chunk.length, shape.m > 1}); + TestDataId test_data_id{shape, variant.acc_pack, k_chunk.length, shape.m > 1}; + const TestReference& reference = get_test_reference(test_data_id); const Rect portion = output_portion.compute_portion(shape.m, shape.n, variant.acc_step.m, variant.acc_step.n); Buffer packed_lhs = imatmul::lhs_pack(variant.lhs_pack, portion, reference, shape.m, k_chunk); -- GitLab