From e151cd142e781d1c74043ddccc3605bfae6a566e Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Tue, 20 May 2025 12:23:35 +0100 Subject: [PATCH 01/18] Add Conv2D example using FP16 IGEMM - Example demonstrates creating an indirect buffer using a Conv2D input tensor - Example demonstrates indirect buffer usage with imatmul kernels. Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 44 +++ .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 323 ++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt create mode 100644 examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt new file mode 100644 index 00000000..edc50297 --- /dev/null +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.16) + +project(matmul_clamp_f16_f16_f16p) + +set(CMAKE_CXX_STANDARD 17) +set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) +set(KAI_BUILD ${KAI_PATH}/build) +set(CMAKE_BUILD_TYPE Debug) + +include_directories( + ${KAI_PATH}/ + ${KAI_PATH}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/ + ${KAI_PATH}/kai/ukernels/matmul/pack/) + +set(KAI_SOURCES + ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c) + +set(KAI_HEADERS + ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h) + +# Files requires to build the executable +add_executable( + conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp + ${KAI_SOURCES} + ${KAI_HEADERS} + ) + +target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p + PRIVATE -march=armv8.2-a+sve+sve2 +) + +target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p + PRIVATE $<$:KAI_DEBUG> +) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp new file mode 100644 index 00000000..b554ccb2 --- /dev/null +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -0,0 +1,323 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +// Example usage for Indirect GEMM with a convolution operation using two half-precision matrices. +// + +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2) +#error This file must be compiled for AArch64, FEAT_SVE2. +#else // Architectural features check. 
+ +#include +#include + +#include +#include +#include + +// Include micro-kernel variants +#include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" +#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h" +#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h" + +using VEC_F16 = std::vector; + +namespace { + +constexpr float clamp_min = (float)-65504; +constexpr float clamp_max = (float)65504; + +struct Shape { + int n; + int h; + int w; + int c; + int co = 1; // Used for only kernel shape. + [[nodiscard]] auto size() const -> size_t { + return n * h * w * c * co; + } + std::string print() const { + std::stringstream ss; + ss << " [ " << n << " , " << h << " ," << w << " , " << c << ", " << co << " ] "; + return ss.str(); + } +}; + +struct ConvInfo { + // Conv info - we assume the following args. + int stride_x = 1; + int stride_y = 1; + int pad_left = 0; + int pad_top = 0; + int pad_right = 0; + int pad_bottom = 0; + int dilation_x = 1; + int dilation_y = 1; +}; + +/// Perform a convolution operation in nhwc data format. +/// @param[in] src Shape of the input tensor in [N, H, W, C] DataFormat +/// @param[in] weights Shape of the weights tensor in [1, H, W, CI, CO] Format +/// @param[in] dst Shape of the output tensor in [N, H, W, C] DataFormat +/// @param[in] in half float pointer to start of input tensor +/// @param[in] wei half float pointer to start of weights tensor +/// @param[in] bias half float pointer to start of bias tensor +/// @param[out] out half float pointer to start of output tensor +/// @param[in] clamp_min Minimum value to clamp final result +/// @param[in] clamp_max Max value to clamp final result +/// @param[in] cinfo Input arguments for convolution +void convolution_layer_nhwc( + Shape src, Shape weights, Shape dst, const float16_t* in, const float16_t* wei, const float16_t* bias, + float16_t* out, float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo = ConvInfo()) { + assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); + assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); + assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); + assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); + + for (int n = 0; n < src.n; ++n) { + for (int oh = 0; oh < dst.h; ++oh) { + for (int ow = 0; ow < dst.w; ++ow) { + for (int kh = 0; kh < weights.h; ++kh) { + if (src.h <= (oh + kh)) continue; + for (int kw = 0; kw < weights.w; ++kw) { + if (src.w <= (ow + kw)) continue; + + for (int ic = 0; ic < src.c; ++ic) { + auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; + auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); + auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + + for (int oc = 0; oc < dst.c; ++oc) { + // acc here. + out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); + } + } + } + } + + // Accumulate bias here. + for (int oc = 0; oc < dst.c; ++oc) { + // acc here. + auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + out[out_idx + oc] += bias[oc]; + } + } + } + } + + // Loop and clamp output data. + for (int i = 0; i < dst.size(); i++) { + out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; + out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; + } +} + +/// Fills the matrix with incremental values +void fill_matrix(size_t size, float16_t* dst, const float16_t weight) { + for (size_t i = 0; i < size; i++) { + dst[i] = float16_t(i * weight); + } +} + +/// Print the matrix +void print_matrix(size_t num_rows, size_t num_cols, const char* name, const float16_t* src) { + std::cout << name << " = [\n"; + for (size_t y = 0; y < num_rows; ++y) { + std::cout << " ["; + for (size_t x = 0; x < num_cols; ++x) { + std::cout << std::setprecision(0) << std::fixed << src[y * num_cols + x] << ", "; + } + std::cout << ("],\n"); + } + std::cout << ("]\n\n"); +} + +/// Print the matrix +void print_matrix(Shape shape, const char* name, const float16_t* src) { + std::cout << name << " = [\n"; + for (size_t n = 0; n < shape.n; n++) { + std::cout << "\n"; + for (size_t y = 0; y < shape.h; ++y) { + std::cout << " ["; + for (size_t x = 0; x < shape.w; x++) { + std::cout << "["; + for (size_t c = 0; c < shape.c; c++) { + if (c != 0) std::cout << ","; + std::cout << std::setprecision(0) << std::fixed + << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; + } + std::cout << "] "; + } + std::cout << ("],\n"); + } + } + std::cout << ("]\n\n"); +} + +// Verify the micro-kernel output matches the reference implementation +bool is_output_correct( + size_t num_rows, size_t num_cols, const float16_t tolerance, const float16_t* ref, const float16_t* act) { + bool is_valid = true; + int count = 0; + for (size_t i = 0; i < num_rows * num_cols; ++i) { + if ((std::fabs((ref[i] - act[i]) / act[i])) > tolerance) { + const size_t x = i % num_cols; + const size_t y = i / num_cols; + count++; + std::cout << std::setprecision(5) << std::fixed << "ERROR![" << y << "][" << x << "]: ref=" << ref[i] + << " vs. act=" << act[i] << "\n"; + + is_valid = false; + } + } + std::cout << "\n\nThere are " << count << " mismatches." << std::endl; + return is_valid; +} +} // namespace + +size_t round_up_division(size_t a, size_t b) { + return (a + b - 1) / b; +} + +int main() { + // Input tensor in {NHWC} format. + Shape input{5, 32, 32, 3}; // Layout : [N, H, W, C] + Shape kernel{1, 3, 3, 3, 2}; // Layout : [1, KH, KW, CI, CO] + Shape output{5, 30, 30, 2}; // Layout : [N, H, W, C] + + // This example only supports default conv arguments. + const ConvInfo conv_info; + + // Init Input buffers. + VEC_F16 in(input.size()); + VEC_F16 filter(kernel.size()); + VEC_F16 bias(output.c); + fill_matrix(in.size(), in.data(), 0.1f); + fill_matrix(filter.size(), filter.data(), 0.01f); + fill_matrix(bias.size(), bias.data(), 1.f); + + // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) + // N - Equivalent to output.c + // K - Width of LHS after im2col - is equivalent to (input_shape.c * kernel.w * kernel.w) when num_groups = 0 and no + // padding + const size_t M = input.n * ((input.w - kernel.w) + 1) * ((input.h - kernel.h) + 1); + const size_t K = (input.c * kernel.w * kernel.h); + const size_t N = kernel.size() / K; + + // Chunking is done in channel dimension (lowest input dim) + const size_t k_chunk_length = input.c; + const size_t k_chunk_count = K / k_chunk_length; + + // Check all shapes are valid. + assert(output.size() == M * N); + assert(input.n == output.n); + assert(kernel.c == input.c && kernel.co == output.c); + assert(kernel.n == 1); + + // ------------------------------------------ + // 1. Pack LHS - Create Indirection buffer. 
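+    // As a concrete illustration of the sizes derived above, for the shapes used in this example:
+    //   M = 5 * 30 * 30 = 4500, K = 3 * 3 * 3 = 27, N = 54 / 27 = 2,
+    //   k_chunk_length = 3 and k_chunk_count = 9.
+    // The indirection table built below has m_step columns and k_chunk_count * ceil(M / m_step) rows,
+    // where m_step depends on the SME vector length at runtime. The pointer for output row m and
+    // filter tap c is stored at index (m / m_step) * (k_chunk_count * m_step) + c * m_step + (m % m_step).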
+ // ------------------------------------------ + + const size_t m_step = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); + const size_t itable_rows = k_chunk_count * round_up_division(M, m_step); + const size_t itable_cols = m_step; + + const size_t out_hw_size = output.h * output.w; + const size_t in_hwc_size = input.size() / input.n; + + float16_t* zero_buffer = in.data(); + + // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. + // Number of blocks is = round_up_division(M, m_step) + const size_t block_size = k_chunk_count * m_step; + std::vector indirect_table(itable_cols * itable_rows, zero_buffer); + + for (size_t batch_idx = 0; batch_idx < output.n; batch_idx++) { + // We iterate over OH * OW dims and retrieve a pointer to relevant input index. + for (size_t out_idx = 0; out_idx < out_hw_size; out_idx++) { + const size_t output_x = out_idx % output.w; + const size_t output_y = out_idx / output.w; + + // Calculates column and row offsets for itable index with respect to block + size_t block_start_x = (((batch_idx * out_hw_size) + out_idx) % m_step); + size_t block_start_y = (((batch_idx * out_hw_size) + out_idx) / m_step); + + // These filter loops will fill the indirection table column-wise for kh*kw elements. + for (size_t kernel_y = 0; kernel_y < kernel.h; kernel_y++) { + const size_t input_y = + output_y * conv_info.stride_y + kernel_y * conv_info.dilation_y - conv_info.pad_top; + if (input_y < input.h) { + for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { + size_t input_x = + output_x * conv_info.stride_x + kernel_x * conv_info.dilation_x - conv_info.pad_left; + size_t kernel_index = kernel_y * kernel.w + kernel_x; + + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; + + if (input_x < input.w) { + indirect_table[index] = + ((float16_t*)in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + + input_x * input.c); + } else { + indirect_table[index] = zero_buffer; + } + } + } else { + for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { + size_t kernel_index = kernel_y * kernel.w + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; + indirect_table[index] = zero_buffer; + } + } + } + } + } + + // Init Output buffers. + VEC_F16 act_output(output.size()); + VEC_F16 ref_output(output.size()); + + // ------------------------------------------------- + // 1b. Pack LHS and RHS. + // ------------------------------------------------- + + // Initialise LHS Packed buffer and call packing kernel. + auto lhs_packed_size = + kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(M, k_chunk_count, k_chunk_length); + auto rhs_packed_size = + kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(N, k_chunk_count, k_chunk_length); + + VEC_F16 packed_lhs(lhs_packed_size); + VEC_F16 packed_rhs(rhs_packed_size); + + kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme( + M, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, packed_lhs.data()); + kai_run_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( + N, k_chunk_count, k_chunk_length, N * sizeof(float16_t), filter.data(), bias.data(), packed_rhs.data()); + + // ------------------------------------------------- + // 2. Perform matmul operation. 
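+    // Conceptually, the indirect matmul below computes, for every output row m (an output spatial
+    // position) and output column n (an output channel):
+    //   dst[m][n] = clamp(bias[n] + sum over k_chunk_count chunks and k_chunk_length values of
+    //                     lhs[m][chunk][k] * weights[chunk][k][n], clamp_min, clamp_max)
+    // where each LHS chunk is fetched through the pointers packed from the indirection table above.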
+ // ------------------------------------------------- + + kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa( + M, N, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), act_output.data(), + N * sizeof(float16_t), clamp_min, clamp_max); + + // ------------------------------------------------- + // 3. Call reference and compare output. + // ------------------------------------------------- + convolution_layer_nhwc( + input, kernel, output, in.data(), filter.data(), bias.data(), ref_output.data(), clamp_min, clamp_max); + + print_matrix(output, "\nTarget : ", act_output.data()); + print_matrix(output, "\nREf : ", ref_output.data()); + + is_output_correct(M, N, 0.01f, ref_output.data(), act_output.data()); + + return 0; +} + +#endif // Architectural features check. -- GitLab From 78e669006065bc848991200bda01918b6dd8b624 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Tue, 20 May 2025 12:23:35 +0100 Subject: [PATCH 02/18] Add Conv2D example using FP16 IGEMM - Example demonstrates creating an indirect buffer using a Conv2D input tensor - Example demonstrates indirect buffer usage with imatmul kernels. Signed-off-by: Mohammed Suhail Munshi --- examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt | 2 +- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index edc50297..916e2e38 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index b554ccb2..810c5caa 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // -- GitLab From 29a8d5778856f27410f6002af3b6d719a424687c Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 22 May 2025 15:31:02 +0100 Subject: [PATCH 03/18] Made suggested changes - Removed unused headers, removed unnecessary vectorization - Used ostream to print instead of stringstream - Removed usage of raw pointers where possible. 
- Made other minor suggested fixes Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 17 +--- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 85 ++++++++++--------- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index 916e2e38..be52dae3 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -6,37 +6,28 @@ cmake_minimum_required(VERSION 3.16) -project(matmul_clamp_f16_f16_f16p) +project(conv2d_imatmul_clamp_f16_f16_f16p) set(CMAKE_CXX_STANDARD 17) set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) set(KAI_BUILD ${KAI_PATH}/build) -set(CMAKE_BUILD_TYPE Debug) +set(CMAKE_BUILD_TYPE Release) -include_directories( - ${KAI_PATH}/ - ${KAI_PATH}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/ - ${KAI_PATH}/kai/ukernels/matmul/pack/) +include_directories(${KAI_PATH}) set(KAI_SOURCES ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c) -set(KAI_HEADERS - ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h - ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h - ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h) - # Files requires to build the executable add_executable( conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp ${KAI_SOURCES} - ${KAI_HEADERS} ) target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p - PRIVATE -march=armv8.2-a+sve+sve2 + PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize" ) target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 810c5caa..00439a5d 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -4,16 +4,17 @@ // SPDX-License-Identifier: Apache-2.0 // -// Example usage for Indirect GEMM with a convolution operation using two half-precision matrices. +// Example usage for Indirect GEMM with a convolution operation using two half-precision float matrices. // -#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2) -#error This file must be compiled for AArch64, FEAT_SVE2. +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. #else // Architectural features check. -#include -#include +#include +#include #include #include #include @@ -31,21 +32,22 @@ constexpr float clamp_min = (float)-65504; constexpr float clamp_max = (float)65504; struct Shape { - int n; - int h; - int w; - int c; - int co = 1; // Used for only kernel shape. + size_t n; + size_t h; + size_t w; + size_t c; + size_t co = 1; // Used for only kernel shape. 
[[nodiscard]] auto size() const -> size_t { return n * h * w * c * co; } - std::string print() const { - std::stringstream ss; - ss << " [ " << n << " , " << h << " ," << w << " , " << c << ", " << co << " ] "; - return ss.str(); - } + friend std::ostream& operator<<(std::ostream& os, const Shape& shape); }; +std::ostream& operator<<(std::ostream& os, const Shape& shape) { + os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << ", " << shape.co << " ] "; + return os; +} + struct ConvInfo { // Conv info - we assume the following args. int stride_x = 1; @@ -70,27 +72,27 @@ struct ConvInfo { /// @param[in] clamp_max Max value to clamp final result /// @param[in] cinfo Input arguments for convolution void convolution_layer_nhwc( - Shape src, Shape weights, Shape dst, const float16_t* in, const float16_t* wei, const float16_t* bias, - float16_t* out, float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo = ConvInfo()) { + Shape src, Shape weights, Shape dst, const VEC_F16& in, const VEC_F16& wei, const VEC_F16& bias, VEC_F16& out, + float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo) { assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); - for (int n = 0; n < src.n; ++n) { - for (int oh = 0; oh < dst.h; ++oh) { - for (int ow = 0; ow < dst.w; ++ow) { - for (int kh = 0; kh < weights.h; ++kh) { + for (size_t n = 0; n < src.n; ++n) { + for (size_t oh = 0; oh < dst.h; ++oh) { + for (size_t ow = 0; ow < dst.w; ++ow) { + for (size_t kh = 0; kh < weights.h; ++kh) { if (src.h <= (oh + kh)) continue; - for (int kw = 0; kw < weights.w; ++kw) { + for (size_t kw = 0; kw < weights.w; ++kw) { if (src.w <= (ow + kw)) continue; - for (int ic = 0; ic < src.c; ++ic) { + for (size_t ic = 0; ic < src.c; ++ic) { auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; - for (int oc = 0; oc < dst.c; ++oc) { + for (size_t oc = 0; oc < dst.c; ++oc) { // acc here. out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); } @@ -99,7 +101,7 @@ void convolution_layer_nhwc( } // Accumulate bias here. - for (int oc = 0; oc < dst.c; ++oc) { + for (size_t oc = 0; oc < dst.c; ++oc) { // acc here. auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; out[out_idx + oc] += bias[oc]; @@ -109,21 +111,21 @@ void convolution_layer_nhwc( } // Loop and clamp output data. - for (int i = 0; i < dst.size(); i++) { + for (size_t i = 0; i < dst.size(); i++) { out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; } } /// Fills the matrix with incremental values -void fill_matrix(size_t size, float16_t* dst, const float16_t weight) { +void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { dst[i] = float16_t(i * weight); } } /// Print the matrix -void print_matrix(size_t num_rows, size_t num_cols, const char* name, const float16_t* src) { +void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t y = 0; y < num_rows; ++y) { std::cout << " ["; @@ -136,7 +138,7 @@ void print_matrix(size_t num_rows, size_t num_cols, const char* name, const floa } /// Print the matrix -void print_matrix(Shape shape, const char* name, const float16_t* src) { +void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { std::cout << "\n"; @@ -145,7 +147,7 @@ void print_matrix(Shape shape, const char* name, const float16_t* src) { for (size_t x = 0; x < shape.w; x++) { std::cout << "["; for (size_t c = 0; c < shape.c; c++) { - if (c != 0) std::cout << ","; + if (c != 0) std::cout << " , "; std::cout << std::setprecision(0) << std::fixed << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; } @@ -159,7 +161,7 @@ void print_matrix(Shape shape, const char* name, const float16_t* src) { // Verify the micro-kernel output matches the reference implementation bool is_output_correct( - size_t num_rows, size_t num_cols, const float16_t tolerance, const float16_t* ref, const float16_t* act) { + size_t num_rows, size_t num_cols, const float16_t tolerance, const VEC_F16& ref, const VEC_F16& act) { bool is_valid = true; int count = 0; for (size_t i = 0; i < num_rows * num_cols; ++i) { @@ -195,9 +197,9 @@ int main() { VEC_F16 in(input.size()); VEC_F16 filter(kernel.size()); VEC_F16 bias(output.c); - fill_matrix(in.size(), in.data(), 0.1f); - fill_matrix(filter.size(), filter.data(), 0.01f); - fill_matrix(bias.size(), bias.data(), 1.f); + fill_matrix(in.size(), in, 0.1f); + fill_matrix(filter.size(), filter, 0.01f); + fill_matrix(bias.size(), bias, 1.f); // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) // N - Equivalent to output.c @@ -259,8 +261,7 @@ int main() { if (input_x < input.w) { indirect_table[index] = - ((float16_t*)in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + - input_x * input.c); + (in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + input_x * input.c); } else { indirect_table[index] = zero_buffer; } @@ -309,13 +310,13 @@ int main() { // ------------------------------------------------- // 3. Call reference and compare output. 
// ------------------------------------------------- - convolution_layer_nhwc( - input, kernel, output, in.data(), filter.data(), bias.data(), ref_output.data(), clamp_min, clamp_max); - - print_matrix(output, "\nTarget : ", act_output.data()); - print_matrix(output, "\nREf : ", ref_output.data()); + convolution_layer_nhwc(input, kernel, output, in, filter, bias, ref_output, clamp_min, clamp_max, conv_info); - is_output_correct(M, N, 0.01f, ref_output.data(), act_output.data()); +#ifdef KAI_DEBUG + print_tensor(output, "\nTarget : ", act_output); + print_tensor(output, "\nREf : ", ref_output); +#endif // KAI_DEBUG + is_output_correct(M, N, 0.01f, ref_output, act_output); return 0; } -- GitLab From df561020e9f9b42a0d7081cac682ce4e318e3399 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 22 May 2025 17:46:36 +0100 Subject: [PATCH 04/18] Use std::clamp Signed-off-by: Mohammed Suhail Munshi --- examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt | 1 - .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index be52dae3..2cf37a29 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -11,7 +11,6 @@ project(conv2d_imatmul_clamp_f16_f16_f16p) set(CMAKE_CXX_STANDARD 17) set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) set(KAI_BUILD ${KAI_PATH}/build) -set(CMAKE_BUILD_TYPE Release) include_directories(${KAI_PATH}) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 00439a5d..4ff917d4 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -14,6 +14,7 @@ #include +#include #include #include #include @@ -112,8 +113,7 @@ void convolution_layer_nhwc( // Loop and clamp output data. for (size_t i = 0; i < dst.size(); i++) { - out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; - out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; + out[i] = std::clamp(out[i], clamp_min, clamp_max); } } -- GitLab From 6ca05c52d58010b9f21660c57216489b5e364036 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 09:58:05 +0100 Subject: [PATCH 05/18] Apply suggested changes Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 3 +- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 355 +++++++++--------- 2 files changed, 184 insertions(+), 174 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index 2cf37a29..aef754c9 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -22,8 +22,7 @@ set(KAI_SOURCES # Files requires to build the executable add_executable( conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp - ${KAI_SOURCES} - ) + ${KAI_SOURCES}) target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize" diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 4ff917d4..2dec00dd 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -29,94 +29,139 @@ using VEC_F16 = std::vector; namespace { -constexpr float clamp_min = (float)-65504; -constexpr float clamp_max = (float)65504; +constexpr float clamp_min = -65504.0F; +constexpr float clamp_max = 65504.0F; struct Shape { size_t n; size_t h; size_t w; size_t c; - size_t co = 1; // Used for only kernel shape. [[nodiscard]] auto size() const -> size_t { - return n * h * w * c * co; + return n * h * w * c; } - friend std::ostream& operator<<(std::ostream& os, const Shape& shape); -}; - -std::ostream& operator<<(std::ostream& os, const Shape& shape) { - os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << ", " << shape.co << " ] "; - return os; -} -struct ConvInfo { - // Conv info - we assume the following args. - int stride_x = 1; - int stride_y = 1; - int pad_left = 0; - int pad_top = 0; - int pad_right = 0; - int pad_bottom = 0; - int dilation_x = 1; - int dilation_y = 1; +#ifdef KAI_DEBUG + friend std::ostream& operator<<(std::ostream& os, const Shape& shape) { + os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << " ] "; + return os; + } +#endif }; /// Perform a convolution operation in nhwc data format. -/// @param[in] src Shape of the input tensor in [N, H, W, C] DataFormat -/// @param[in] weights Shape of the weights tensor in [1, H, W, CI, CO] Format -/// @param[in] dst Shape of the output tensor in [N, H, W, C] DataFormat -/// @param[in] in half float pointer to start of input tensor -/// @param[in] wei half float pointer to start of weights tensor +/// @param[in] in_shape Shape of the input tensor in [N, H, W, C] DataFormat +/// @param[in] out_shape Shape of the output tensor in [N, H, W, C] DataFormat +/// @param[in] filter_height Height of convolution filter. +/// @param[in] filter_width Width of convolution filter. 
+/// @param[in] feature_map half float pointer to start of input tensor +/// @param[in] weights half float pointer to start of weights tensor /// @param[in] bias half float pointer to start of bias tensor /// @param[out] out half float pointer to start of output tensor /// @param[in] clamp_min Minimum value to clamp final result /// @param[in] clamp_max Max value to clamp final result -/// @param[in] cinfo Input arguments for convolution void convolution_layer_nhwc( - Shape src, Shape weights, Shape dst, const VEC_F16& in, const VEC_F16& wei, const VEC_F16& bias, VEC_F16& out, - float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo) { - assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); - assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); - assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); - assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); - - for (size_t n = 0; n < src.n; ++n) { - for (size_t oh = 0; oh < dst.h; ++oh) { - for (size_t ow = 0; ow < dst.w; ++ow) { - for (size_t kh = 0; kh < weights.h; ++kh) { - if (src.h <= (oh + kh)) continue; - for (size_t kw = 0; kw < weights.w; ++kw) { - if (src.w <= (ow + kw)) continue; - - for (size_t ic = 0; ic < src.c; ++ic) { - auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; - auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); - auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; - - for (size_t oc = 0; oc < dst.c; ++oc) { - // acc here. - out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); + Shape in_shape, Shape out_shape, const size_t filter_height, const size_t filter_width, const VEC_F16& feature_map, + const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, float16_t clamp_max) { + for (size_t n = 0; n < out_shape.n; ++n) { + for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { + for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { + // Apply filter to feature map. + for (size_t kernel_h = 0; kernel_h < filter_height; ++kernel_h) { + if (in_shape.h <= (out_h + kernel_h)) continue; + for (size_t kernel_w = 0; kernel_w < filter_width; ++kernel_w) { + if (in_shape.w <= (out_w + kernel_w)) continue; + + for (size_t ic = 0; ic < in_shape.c; ++ic) { + auto in_idx = + ((n * in_shape.h + (out_h + kernel_h)) * in_shape.w + (out_w + kernel_w)) * in_shape.c + + ic; + auto weights_idx = (((kernel_h * filter_width + kernel_w) * in_shape.c + ic) * out_shape.c); + auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c; + + for (size_t oc = 0; oc < out_shape.c; ++oc) { + // Perform actual accumulation and store in output vector + out[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]); } } } } - // Accumulate bias here. - for (size_t oc = 0; oc < dst.c; ++oc) { - // acc here. - auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + // Perform bias accumulation for channel idx and store in output vector. + for (size_t oc = 0; oc < out_shape.c; ++oc) { + auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c; out[out_idx + oc] += bias[oc]; } } } } - // Loop and clamp output data. - for (size_t i = 0; i < dst.size(); i++) { + // Apply clamping in-place to output of covolution. + for (size_t i = 0; i < out_shape.size(); i++) { out[i] = std::clamp(out[i], clamp_min, clamp_max); } } +/// Fill a provided indirection table according to tensor shape parameters. +/// @param[in] feature_map Input feature map tensor +/// @param[out] indirection_table Indirection buffer to fill in place. 
+/// @param[in] pad_buffer Pointer to start of padding. +/// @param[in] in_shape Shape of input tensor [N,H,W,C] format. +/// @param[in] out_shape Shape of output tensor [N,H,W,C] format. +/// @param[in] filter_height Height of convolution filter. +/// @param[in] filter_width Width of convolution filter. +/// @param[in] itable_cols Number of columns in indirection table (m_step) +std::vector init_indirection_table( + VEC_F16& feature_map, std::vector& indirect_table, float16_t* pad_buffer, const Shape& in_shape, + const Shape& out_shape, const size_t filter_height, const size_t filter_width, const size_t itable_cols) { + // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. + // Number of blocks is = round_up_division(M, m_step) + const size_t block_size = filter_height * filter_width * itable_cols; + const size_t in_hwc_size = in_shape.h * in_shape.w * in_shape.c; + + // The following code iterates over the first 3 dims of the output tensor and retrieves KH*KW number of pointers to + // the input matrix for each idx. These pointers are stored columnwise in the itable, beginning with an offset. + for (size_t batch_idx = 0; batch_idx < out_shape.n; batch_idx++) { + for (size_t output_y = 0; output_y < out_shape.h; output_y++) { + for (size_t output_x = 0; output_x < out_shape.w; output_x++) { + // Calculates column and row offsets for itable index with respect to current block location and itable + // column length (equivalent to m_step) The block start x/y offsets ensure the data is padded in the + // format expected by the LHS Packing kernel. + size_t block_start_x = + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) % itable_cols); + size_t block_start_y = + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) / itable_cols); + for (size_t kernel_y = 0; kernel_y < filter_height; kernel_y++) { + const size_t input_y = output_y + kernel_y; + if (input_y < in_shape.h) { + for (size_t kernel_x = 0; kernel_x < filter_width; kernel_x++) { + size_t input_x = output_x + kernel_x; + size_t kernel_index = kernel_y * filter_width + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * itable_cols; + + if (input_x < in_shape.w) { + indirect_table[index] = + (feature_map.data() + batch_idx * in_hwc_size + input_y * in_shape.w * in_shape.c + + input_x * in_shape.c); + } else { + indirect_table[index] = pad_buffer; + } + } + } else { + for (size_t kernel_x = 0; kernel_x < filter_width; kernel_x++) { + size_t kernel_index = kernel_y * filter_width + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * itable_cols; + indirect_table[index] = pad_buffer; + } + } + } + } + } + } + + return indirect_table; +} + /// Fills the matrix with incremental values void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { @@ -124,20 +169,29 @@ void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { } } -/// Print the matrix +#ifdef KAI_DEBUG +/// Function prints a matrix according to the rows and columns specified. +/// @param[in] num_rows Number of rows in the matrix. +/// @param[in] num_cols Number of columns in the matrix. +/// @param[in] name The name of the matrix to be printed. This will be included in the output. +/// @param[in] src A vector of F16 elements representing the matrix. 
void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) { - std::cout << name << " = [\n"; - for (size_t y = 0; y < num_rows; ++y) { + std::cout << "\n" << name << " = [\n"; + for (size_t row = 0; row < num_rows; ++row) { std::cout << " ["; - for (size_t x = 0; x < num_cols; ++x) { - std::cout << std::setprecision(0) << std::fixed << src[y * num_cols + x] << ", "; + for (size_t col = 0; col < num_cols; ++col) { + std::cout << std::setprecision(0) << std::fixed << src[row * num_cols + col] << ", "; } std::cout << ("],\n"); } std::cout << ("]\n\n"); } -/// Print the matrix +/// Function prints a tensor in NHWC format. +/// Width and channels are printed on the same line. Square brackets are used to denote dimensions. +/// @param[in] shape A struct containing the NHWC shape of the tensor. +/// @param[in] name Name of the tensor +/// @param[in] src A vector of F16 elements representing the tensor. void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { @@ -148,7 +202,7 @@ void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << "["; for (size_t c = 0; c < shape.c; c++) { if (c != 0) std::cout << " , "; - std::cout << std::setprecision(0) << std::fixed + std::cout << std::setprecision(1) << std::fixed << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; } std::cout << "] "; @@ -158,6 +212,7 @@ void print_tensor(Shape shape, const char* name, const VEC_F16& src) { } std::cout << ("]\n\n"); } +#endif // KAI_DEBUG // Verify the micro-kernel output matches the reference implementation bool is_output_correct( @@ -178,145 +233,101 @@ bool is_output_correct( std::cout << "\n\nThere are " << count << " mismatches." << std::endl; return is_valid; } -} // namespace size_t round_up_division(size_t a, size_t b) { return (a + b - 1) / b; } +} // namespace int main() { - // Input tensor in {NHWC} format. - Shape input{5, 32, 32, 3}; // Layout : [N, H, W, C] - Shape kernel{1, 3, 3, 3, 2}; // Layout : [1, KH, KW, CI, CO] - Shape output{5, 30, 30, 2}; // Layout : [N, H, W, C] - - // This example only supports default conv arguments. - const ConvInfo conv_info; - - // Init Input buffers. - VEC_F16 in(input.size()); - VEC_F16 filter(kernel.size()); - VEC_F16 bias(output.c); - fill_matrix(in.size(), in, 0.1f); - fill_matrix(filter.size(), filter, 0.01f); + // Arguments for convolution operation. + // Padding must be valid + const size_t batch_size = 5; + const size_t input_height = 32; + const size_t input_width = 32; + const size_t input_channels = 3; + const size_t filter_height = 3; + const size_t filter_width = 3; + const size_t out_channels = 2; + + // Use shape arguments to define tensor shapes in NHWC Format. + const Shape in_shape{batch_size, input_height, input_width, input_channels}; + const Shape weights_shape{filter_height, filter_width, input_channels, out_channels}; + const Shape out_shape{ + batch_size, (input_height - filter_height + 1), (input_width - filter_width + 1), out_channels}; + + // Define and Fill Input Tensors for operation using shapes + VEC_F16 feature_map(in_shape.size()); + VEC_F16 weights(weights_shape.size()); + VEC_F16 bias(out_channels); + + // Fill by iterating over in 1D and multiplying idx by the weight supplied as argument. 
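+    // For example, a weight of 0.1f fills the buffer with 0.0, 0.1, 0.2, 0.3, ...
+    // (each value converted to float16_t).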
+ fill_matrix(feature_map.size(), feature_map, 0.1f); + fill_matrix(weights.size(), weights, 0.2f); fill_matrix(bias.size(), bias, 1.f); - // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) - // N - Equivalent to output.c - // K - Width of LHS after im2col - is equivalent to (input_shape.c * kernel.w * kernel.w) when num_groups = 0 and no - // padding - const size_t M = input.n * ((input.w - kernel.w) + 1) * ((input.h - kernel.h) + 1); - const size_t K = (input.c * kernel.w * kernel.h); - const size_t N = kernel.size() / K; - - // Chunking is done in channel dimension (lowest input dim) - const size_t k_chunk_length = input.c; - const size_t k_chunk_count = K / k_chunk_length; - - // Check all shapes are valid. - assert(output.size() == M * N); - assert(input.n == output.n); - assert(kernel.c == input.c && kernel.co == output.c); - assert(kernel.n == 1); - - // ------------------------------------------ - // 1. Pack LHS - Create Indirection buffer. - // ------------------------------------------ - - const size_t m_step = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); - const size_t itable_rows = k_chunk_count * round_up_division(M, m_step); - const size_t itable_cols = m_step; - - const size_t out_hw_size = output.h * output.w; - const size_t in_hwc_size = input.size() / input.n; - - float16_t* zero_buffer = in.data(); - - // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. - // Number of blocks is = round_up_division(M, m_step) - const size_t block_size = k_chunk_count * m_step; - std::vector indirect_table(itable_cols * itable_rows, zero_buffer); - - for (size_t batch_idx = 0; batch_idx < output.n; batch_idx++) { - // We iterate over OH * OW dims and retrieve a pointer to relevant input index. - for (size_t out_idx = 0; out_idx < out_hw_size; out_idx++) { - const size_t output_x = out_idx % output.w; - const size_t output_y = out_idx / output.w; - - // Calculates column and row offsets for itable index with respect to block - size_t block_start_x = (((batch_idx * out_hw_size) + out_idx) % m_step); - size_t block_start_y = (((batch_idx * out_hw_size) + out_idx) / m_step); - - // These filter loops will fill the indirection table column-wise for kh*kw elements. - for (size_t kernel_y = 0; kernel_y < kernel.h; kernel_y++) { - const size_t input_y = - output_y * conv_info.stride_y + kernel_y * conv_info.dilation_y - conv_info.pad_top; - if (input_y < input.h) { - for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { - size_t input_x = - output_x * conv_info.stride_x + kernel_x * conv_info.dilation_x - conv_info.pad_left; - size_t kernel_index = kernel_y * kernel.w + kernel_x; - - size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; - - if (input_x < input.w) { - indirect_table[index] = - (in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + input_x * input.c); - } else { - indirect_table[index] = zero_buffer; - } - } - } else { - for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { - size_t kernel_index = kernel_y * kernel.w + kernel_x; - size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; - indirect_table[index] = zero_buffer; - } - } - } - } - } - - // Init Output buffers. 
- VEC_F16 act_output(output.size()); - VEC_F16 ref_output(output.size()); + // The following are used as parameters in the indirection kernels + const size_t out_nhw_size = out_shape.n * out_shape.h * out_shape.w; + const size_t k_chunk_length = input_channels; + const size_t k_chunk_count = filter_height * filter_width; // ------------------------------------------------- - // 1b. Pack LHS and RHS. + // 1. Create Indirection buffer. // ------------------------------------------------- + // Define and Fill the indirection table in the format expected of the LHS Indirection Matmul kernel. + // NOTE: out_nhw_size is equivalent to M argument for Indirection kernels. + // out_channels is equivalent to N argument for Indirection kernels. + const size_t itable_cols = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); + const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols); + std::vector indirect_table(itable_cols * itable_rows); + + // Start of input feature map is passed as padding pointer, this is not neccessary. + float16_t* pad_buffer = feature_map.data(); + init_indirection_table( + feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols); - // Initialise LHS Packed buffer and call packing kernel. + // ------------------------------------------------- + // 2. Pack LHS and RHS. + // ------------------------------------------------- auto lhs_packed_size = - kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(M, k_chunk_count, k_chunk_length); - auto rhs_packed_size = - kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(N, k_chunk_count, k_chunk_length); + kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(out_nhw_size, k_chunk_count, k_chunk_length); + auto rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( + out_channels, k_chunk_count, k_chunk_length); VEC_F16 packed_lhs(lhs_packed_size); VEC_F16 packed_rhs(rhs_packed_size); + // Padding is not used in the indirection buffer (as padding is valid), therefore pad_ptr is nullptr + // Ptr offset is provided as 0 as it is not needed to apply an offset to each valid pointer provided in the table in + // this case. kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme( - M, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, packed_lhs.data()); + out_nhw_size, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, + packed_lhs.data()); kai_run_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( - N, k_chunk_count, k_chunk_length, N * sizeof(float16_t), filter.data(), bias.data(), packed_rhs.data()); + out_channels, k_chunk_count, k_chunk_length, out_channels * sizeof(float16_t), weights.data(), bias.data(), + packed_rhs.data()); // ------------------------------------------------- - // 2. Perform matmul operation. + // 3. Perform matmul operation and call reference, then compare. // ------------------------------------------------- + VEC_F16 act_output(out_shape.size()); + VEC_F16 ref_output(out_shape.size()); kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa( - M, N, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), act_output.data(), - N * sizeof(float16_t), clamp_min, clamp_max); + out_nhw_size, out_channels, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), + act_output.data(), out_channels * sizeof(float16_t), clamp_min, clamp_max); - // ------------------------------------------------- - // 3. 
Call reference and compare output. - // ------------------------------------------------- - convolution_layer_nhwc(input, kernel, output, in, filter, bias, ref_output, clamp_min, clamp_max, conv_info); + convolution_layer_nhwc( + in_shape, out_shape, filter_height, filter_width, feature_map, weights, bias, ref_output, clamp_min, clamp_max); #ifdef KAI_DEBUG + std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape + << " Output Shape : " << out_shape << std::endl; print_tensor(output, "\nTarget : ", act_output); print_tensor(output, "\nREf : ", ref_output); #endif // KAI_DEBUG - is_output_correct(M, N, 0.01f, ref_output, act_output); + + is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output); return 0; } -- GitLab From 2d41918e3a43b888e6f1569f05534479874476cb Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 11:55:57 +0100 Subject: [PATCH 06/18] Minor changes to improve comment documation Signed-off-by: Mohammed Suhail Munshi --- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 2dec00dd..bed277bf 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -19,6 +19,7 @@ #include #include #include +#include // Include micro-kernel variants #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" @@ -61,8 +62,9 @@ struct Shape { /// @param[in] clamp_min Minimum value to clamp final result /// @param[in] clamp_max Max value to clamp final result void convolution_layer_nhwc( - Shape in_shape, Shape out_shape, const size_t filter_height, const size_t filter_width, const VEC_F16& feature_map, - const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, float16_t clamp_max) { + const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width, + const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, + float16_t clamp_max) { for (size_t n = 0; n < out_shape.n; ++n) { for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { @@ -162,7 +164,10 @@ std::vector init_indirection_table( return indirect_table; } -/// Fills the matrix with incremental values +/// Fills the matrix with incremental values according to the provided weight. +/// @param[in] size Total number of elements to fill in passed vector;. +/// @param[in] dst Vector representing a tensor to fill. +/// @param[in] weight A weight value to increment by. void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { dst[i] = float16_t(i * weight); @@ -192,7 +197,7 @@ void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_ /// @param[in] shape A struct containing the NHWC shape of the tensor. /// @param[in] name Name of the tensor /// @param[in] src A vector of F16 elements representing the tensor. 
-void print_tensor(Shape shape, const char* name, const VEC_F16& src) { +void print_tensor(const Shape& shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { std::cout << "\n"; @@ -261,7 +266,7 @@ int main() { VEC_F16 weights(weights_shape.size()); VEC_F16 bias(out_channels); - // Fill by iterating over in 1D and multiplying idx by the weight supplied as argument. + // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0. fill_matrix(feature_map.size(), feature_map, 0.1f); fill_matrix(weights.size(), weights, 0.2f); fill_matrix(bias.size(), bias, 1.f); -- GitLab From c594b7002bb2ee18957f9c8adfca85d4064d4c07 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 12:12:54 +0100 Subject: [PATCH 07/18] Bug fixes Signed-off-by: Mohammed Suhail Munshi --- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index bed277bf..096d591f 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -63,8 +63,8 @@ struct Shape { /// @param[in] clamp_max Max value to clamp final result void convolution_layer_nhwc( const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width, - const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, - float16_t clamp_max) { + const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float clamp_min, + float clamp_max) { for (size_t n = 0; n < out_shape.n; ++n) { for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { @@ -100,7 +100,7 @@ void convolution_layer_nhwc( // Apply clamping in-place to output of covolution. for (size_t i = 0; i < out_shape.size(); i++) { - out[i] = std::clamp(out[i], clamp_min, clamp_max); + out[i] = std::clamp(out[i], static_cast(clamp_min), static_cast(clamp_max)); } } @@ -130,9 +130,9 @@ std::vector init_indirection_table( // column length (equivalent to m_step) The block start x/y offsets ensure the data is padded in the // format expected by the LHS Packing kernel. size_t block_start_x = - (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) % itable_cols); + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.w + output_x)) % itable_cols); size_t block_start_y = - (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) / itable_cols); + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.w + output_x)) / itable_cols); for (size_t kernel_y = 0; kernel_y < filter_height; kernel_y++) { const size_t input_y = output_y + kernel_y; if (input_y < in_shape.h) { @@ -175,23 +175,6 @@ void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { } #ifdef KAI_DEBUG -/// Function prints a matrix according to the rows and columns specified. -/// @param[in] num_rows Number of rows in the matrix. -/// @param[in] num_cols Number of columns in the matrix. -/// @param[in] name The name of the matrix to be printed. This will be included in the output. 
-/// @param[in] src A vector of F16 elements representing the matrix.
-void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) {
-    std::cout << "\n" << name << " = [\n";
-    for (size_t row = 0; row < num_rows; ++row) {
-        std::cout << " [";
-        for (size_t col = 0; col < num_cols; ++col) {
-            std::cout << std::setprecision(0) << std::fixed << src[row * num_cols + col] << ", ";
-        }
-        std::cout << ("],\n");
-    }
-    std::cout << ("]\n\n");
-}
-
 /// Function prints a tensor in NHWC format.
 /// Width and channels are printed on the same line. Square brackets are used to denote dimensions.
 /// @param[in] shape A struct containing the NHWC shape of the tensor.
@@ -251,8 +234,8 @@ int main() {
     const size_t input_height = 32;
     const size_t input_width = 32;
     const size_t input_channels = 3;
-    const size_t filter_height = 3;
-    const size_t filter_width = 3;
+    const size_t filter_height = 5;
+    const size_t filter_width = 2;
     const size_t out_channels = 2;
 
     // Use shape arguments to define tensor shapes in NHWC Format.
@@ -261,6 +244,11 @@ int main() {
     const Shape out_shape{
         batch_size, (input_height - filter_height + 1), (input_width - filter_width + 1), out_channels};
 
+#ifdef KAI_DEBUG
+    std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape
+              << " Output Shape : " << out_shape << std::endl;
+#endif // KAI_DEBUG
+
     // Define and Fill Input Tensors for operation using shapes
     VEC_F16 feature_map(in_shape.size());
     VEC_F16 weights(weights_shape.size());
@@ -268,7 +256,7 @@ int main() {
     // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0.
     fill_matrix(feature_map.size(), feature_map, 0.1f);
-    fill_matrix(weights.size(), weights, 0.2f);
+    fill_matrix(weights.size(), weights, 0.1f);
     fill_matrix(bias.size(), bias, 1.f);
 
     // The following are used as parameters in the indirection kernels
@@ -326,10 +314,8 @@ int main() {
         in_shape, out_shape, filter_height, filter_width, feature_map, weights, bias, ref_output, clamp_min, clamp_max);
 
 #ifdef KAI_DEBUG
-    std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape
-              << " Output Shape : " << out_shape << std::endl;
-    print_tensor(output, "\nTarget : ", act_output);
-    print_tensor(output, "\nREf : ", ref_output);
+    print_tensor(out_shape, "\nTarget : ", act_output);
+    print_tensor(out_shape, "\nREf : ", ref_output);
 #endif // KAI_DEBUG
 
     is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output);
--
GitLab


From 29d3ed28743f4b9556a51a42a9a40eb75b4c5c1b Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Wed, 28 May 2025 14:09:15 +0100
Subject: [PATCH 08/18] Use FP32 Accumulator in Conv2D reference

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 096d591f..c2561b4a 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -65,6 +65,9 @@ void convolution_layer_nhwc(
     const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
     const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float clamp_min,
     float clamp_max) {
+    // We accumulate in FP32 and clamp later.
+    std::vector<float> acc(out_shape.size());
+
     for (size_t n = 0; n < out_shape.n; ++n) {
         for (size_t out_h = 0; out_h < out_shape.h; ++out_h) {
             for (size_t out_w = 0; out_w < out_shape.w; ++out_w) {
@@ -83,7 +86,7 @@ void convolution_layer_nhwc(
                         for (size_t oc = 0; oc < out_shape.c; ++oc) {
                             // Perform actual accumulation and store in output vector
-                            out[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]);
+                            acc[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]);
                         }
                     }
                 }
 
@@ -92,15 +95,15 @@ void convolution_layer_nhwc(
                 // Perform bias accumulation for channel idx and store in output vector.
                 for (size_t oc = 0; oc < out_shape.c; ++oc) {
                     auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c;
-                    out[out_idx + oc] += bias[oc];
+                    acc[out_idx + oc] += bias[oc];
                 }
             }
         }
     }
 
-    // Apply clamping in-place to output of covolution.
+    // Apply clamping to accumulator, cast to FP16 and store in output vector at the same idx.
     for (size_t i = 0; i < out_shape.size(); i++) {
-        out[i] = std::clamp(out[i], static_cast<float16_t>(clamp_min), static_cast<float16_t>(clamp_max));
+        out[i] = static_cast<float16_t>(std::clamp(acc[i], clamp_min, clamp_max));
     }
 }
 
@@ -256,7 +259,7 @@ int main() {
     // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0.
     fill_matrix(feature_map.size(), feature_map, 0.1f);
-    fill_matrix(weights.size(), weights, 0.1f);
+    fill_matrix(weights.size(), weights, 0.01f);
     fill_matrix(bias.size(), bias, 1.f);
 
     // The following are used as parameters in the indirection kernels
@@ -318,7 +321,7 @@ int main() {
     print_tensor(out_shape, "\nREf : ", ref_output);
 #endif // KAI_DEBUG
 
-    is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output);
+    is_output_correct(out_nhw_size, out_channels, 0.0001f, ref_output, act_output);
 
     return 0;
 }
--
GitLab
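Patch 08 moves the reference convolution onto a single-precision accumulator so repeated FP16 additions do not saturate or lose precision; the result is clamped once and narrowed back to FP16 at the end. A minimal standalone sketch of that pattern, assuming float16_t comes from <arm_neon.h> as on typical AArch64 toolchains (the function name and shapes here are illustrative only, not part of the example):

    #include <arm_neon.h>  // assumption: provides the float16_t typedef on AArch64
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Accumulate element-wise products in FP32, then clamp once and narrow to FP16.
    void accumulate_clamp_f16(
        const std::vector<float16_t>& a, const std::vector<float16_t>& b, std::vector<float16_t>& out,
        float clamp_min, float clamp_max) {
        std::vector<float> acc(out.size(), 0.0F);
        for (size_t i = 0; i < out.size(); ++i) {
            acc[i] += static_cast<float>(a[i]) * static_cast<float>(b[i]);
        }
        for (size_t i = 0; i < out.size(); ++i) {
            out[i] = static_cast<float16_t>(std::clamp(acc[i], clamp_min, clamp_max));
        }
    }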
From c943fdbebc6bfe5666934c6e73c0a2a4ef546beb Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Thu, 29 May 2025 12:32:19 +0100
Subject: [PATCH 09/18] Add missing header from standard library.

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index c2561b4a..7c0191b2 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
--
GitLab


From 6f41537e2ab93374fa7ebb0f1645e80ad1e6fcee Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Thu, 29 May 2025 15:31:36 +0100
Subject: [PATCH 10/18] Add HW check for SME2 Extension

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 7c0191b2..06fbc67c 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -22,6 +22,14 @@
 #include
 #include
+#if defined(__linux__)
+#include <sys/auxv.h>
+#endif // defined(__linux__)
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif // defined(__APPLE__)
+
 // Include micro-kernel variants
 #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h"
@@ -229,9 +237,33 @@ bool is_output_correct(
 size_t round_up_division(size_t a, size_t b) {
     return (a + b - 1) / b;
 }
+
+// Check for hardware support for SME2 (required by IMatmul kernels).
+bool has_sme2_support() {
+#if defined(__linux__)
+#ifndef HWCAP2_SME2
+    constexpr uint64_t HWCAP2_SME2 = 1UL << 37;
+#endif // HWCAP2_SME2
+    unsigned long hwcaps = getauxval(AT_HWCAP2);
+    if (hwcaps & HWCAP2_SME2) return true;
+#elif defined(__APPLE__)
+    uint32_t value{};
+    size_t len = sizeof(value);
+    if (sysctlbyname("hw.optional.arm.FEAT_SME2", &value, &len, NULL, 0) != 0) return false;
+    return value;
+#endif // OS check
+    return false;
+}
+
 } // namespace
 
 int main() {
+    // Check for SME support and skip tests if not supported.
+    if (!has_sme2_support()) {
+        printf("\nThis example requires support for the SME2 CPU extension.\n");
+        return 0;
+    }
+
     // Arguments for convolution operation.
     // Padding must be valid
     const size_t batch_size = 5;
--
GitLab
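On Linux, the gate added here reads the auxiliary vector: getauxval and AT_HWCAP2 come from <sys/auxv.h>, and the SME2 bit (1UL << 37, as in the hunk above) is tested directly so the code still builds against older headers that predate the HWCAP2_SME2 macro; patch 11 below removes this check again in favour of CI-level gating. A condensed, Linux-only sketch of the same idea (illustrative, not part of the patch):

    #include <cstdint>

    #if defined(__linux__)
    #include <sys/auxv.h>
    #endif // defined(__linux__)

    // Returns true when the kernel reports the SME2 feature bit in AT_HWCAP2.
    bool cpu_has_sme2() {
    #if defined(__linux__)
    #ifndef HWCAP2_SME2
        constexpr uint64_t HWCAP2_SME2 = 1UL << 37;  // same bit as the patch above
    #endif // HWCAP2_SME2
        return (getauxval(AT_HWCAP2) & HWCAP2_SME2) != 0;
    #else
        return false;  // other platforms need their own query, e.g. sysctlbyname on macOS
    #endif // defined(__linux__)
    }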
From aa36d2de50cf3b337f73df997db439281de3a0ae Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Fri, 30 May 2025 14:39:59 +0100
Subject: [PATCH 11/18] Remove hardware check and prevent sme example from running in tests.

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 1 +
 .../CMakeLists.txt | 0
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 32 -------------------
 3 files changed, 1 insertion(+), 32 deletions(-)
 rename examples/{conv2d_imatmul_clamp_f16_f16_f16p => conv2d_imatmul_clamp_f16_f16_f16p_sme2}/CMakeLists.txt (100%)
 rename examples/{conv2d_imatmul_clamp_f16_f16_f16p => conv2d_imatmul_clamp_f16_f16_f16p_sme2}/conv2d_imatmul_clamp_f16_f16_f16p.cpp (94%)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2f2d56d8..23442725 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -122,6 +122,7 @@ build-examples:
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
+          [[ $EXAMPLE == *sme* ]] && continue
           echo "-----------------------------------------------------------"
           echo "Build examples/${EXAMPLE}"
           echo "-----------------------------------------------------------"
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
similarity index 100%
rename from examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt
rename to examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
similarity index 94%
rename from examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
rename to examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 06fbc67c..7c0191b2 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -22,14 +22,6 @@
 #include
 #include
-#if defined(__linux__)
-#include <sys/auxv.h>
-#endif // defined(__linux__)
-
-#if defined(__APPLE__)
-#include <sys/sysctl.h>
-#endif // defined(__APPLE__)
-
 // Include micro-kernel variants
 #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h"
@@ -237,33 +229,9 @@ bool is_output_correct(
 size_t round_up_division(size_t a, size_t b) {
     return (a + b - 1) / b;
 }
-
-// Check for hardware support for SME2 (required by IMatmul kernels).
-bool has_sme2_support() {
-#if defined(__linux__)
-#ifndef HWCAP2_SME2
-    constexpr uint64_t HWCAP2_SME2 = 1UL << 37;
-#endif // HWCAP2_SME2
-    unsigned long hwcaps = getauxval(AT_HWCAP2);
-    if (hwcaps & HWCAP2_SME2) return true;
-#elif defined(__APPLE__)
-    uint32_t value{};
-    size_t len = sizeof(value);
-    if (sysctlbyname("hw.optional.arm.FEAT_SME2", &value, &len, NULL, 0) != 0) return false;
-    return value;
-#endif // OS check
-    return false;
-}
-
 } // namespace
 
 int main() {
-    // Check for SME support and skip tests if not supported.
-    if (!has_sme2_support()) {
-        printf("\nThis example requires support for the SME2 CPU extension.\n");
-        return 0;
-    }
-
     // Arguments for convolution operation.
     // Padding must be valid
     const size_t batch_size = 5;
--
GitLab


From 325b4dfb8742c10f044f32b788b869c79384a391 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 11:45:39 +0100
Subject: [PATCH 12/18] Make suggested changes

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 7c0191b2..e5028667 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -31,8 +31,8 @@ using VEC_F16 = std::vector;
 
 namespace {
 
-constexpr float clamp_min = -65504.0F;
-constexpr float clamp_max = 65504.0F;
+constexpr float clamp_min = -9000.0F;
+constexpr float clamp_max = 9000.0F;
 
 struct Shape {
     size_t n;
@@ -117,9 +117,10 @@ void convolution_layer_nhwc(
 /// @param[in] filter_height Height of convolution filter.
 /// @param[in] filter_width Width of convolution filter.
 /// @param[in] itable_cols Number of columns in indirection table (m_step)
-std::vector init_indirection_table(
-    VEC_F16& feature_map, std::vector& indirect_table, float16_t* pad_buffer, const Shape& in_shape,
-    const Shape& out_shape, const size_t filter_height, const size_t filter_width, const size_t itable_cols) {
+std::vector init_indirection_table(
+    const VEC_F16& feature_map, std::vector& indirect_table, const float16_t* pad_buffer,
+    const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
+    const size_t itable_cols) {
     // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step.
     // Number of blocks is = round_up_division(M, m_step)
     const size_t block_size = filter_height * filter_width * itable_cols;
@@ -276,10 +277,10 @@ int main() {
     // out_channels is equivalent to N argument for Indirection kernels.
     const size_t itable_cols = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme();
     const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);
-    std::vector indirect_table(itable_cols * itable_rows);
+    std::vector indirect_table(itable_cols * itable_rows);
 
     // Start of input feature map is passed as padding pointer, this is not neccessary.
-    float16_t* pad_buffer = feature_map.data();
+    const float16_t* pad_buffer = feature_map.data();
 
     init_indirection_table(
         feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols);
--
GitLab
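The comment block touched above describes the indirection buffer as round_up_division(M, m_step) blocks, each holding filter_height * filter_width * m_step row pointers (one k-chunk per filter position). A self-contained sketch of just that sizing arithmetic, using the example's shapes and an assumed m_step value in place of the kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme() query (numbers are illustrative only):

    #include <cstddef>
    #include <cstdio>

    size_t round_up_division(size_t a, size_t b) { return (a + b - 1) / b; }

    int main() {
        // Illustrative: batch 5, 32x32 input, 5x2 filter as in the example; m_step is assumed here,
        // the real example queries it from the LHS packing micro-kernel.
        const size_t m_step = 8;
        const size_t out_nhw_size = 5 * 28 * 31;  // M = batch * out_height * out_width
        const size_t k_chunk_count = 5 * 2;       // one k-chunk per filter position

        const size_t itable_cols = m_step;
        const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);

        // The table is round_up_division(M, m_step) blocks of k_chunk_count * m_step pointers each.
        std::printf(
            "blocks: %zu, entries per block: %zu, total entries: %zu\n",
            round_up_division(out_nhw_size, itable_cols), k_chunk_count * itable_cols, itable_cols * itable_rows);
        return 0;
    }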
From 675e27fc3192fa289de6e34cf3617faca21dc73b Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 11:48:13 +0100
Subject: [PATCH 13/18] Add changelog note

Signed-off-by: Mohammed Suhail Munshi
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3845ca8e..005bbaff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 - Fixes:
   - Address segmentation faults in benchmarking tool.
   - Fix clamping issues for FP16 and BF16 in testing framework.
+- Added Convolution example using SME Indirect Matmul Kernels
 
 ## v1.8.0
 
--
GitLab


From a5bf700b860c14a373d3edd5b65affb5df665ce0 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 14:25:44 +0100
Subject: [PATCH 14/18] Change return type of function to void

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index e5028667..28274282 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -117,7 +117,7 @@ void convolution_layer_nhwc(
 /// @param[in] filter_height Height of convolution filter.
 /// @param[in] filter_width Width of convolution filter.
 /// @param[in] itable_cols Number of columns in indirection table (m_step)
-std::vector init_indirection_table(
+void init_indirection_table(
     const VEC_F16& feature_map, std::vector& indirect_table, const float16_t* pad_buffer,
     const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
     const size_t itable_cols) {
--
GitLab


From 24b5221df1f107ada7e145188a4df1396c117de2 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 10:29:25 +0100
Subject: [PATCH 15/18] Remove padding buffer as it is not used.

Signed-off-by: Mohammed Suhail Munshi
---
 CHANGELOG.md | 3 ++-
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 10 ++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 005bbaff..8cb651ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 
 ## Upcoming Release
 
+- Added Convolution example using SME Indirect Matmul Kernels
+
 ## v1.9.0
 
 - Extend support for signed 4-bit integer inputs in `kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon`.
@@ -21,7 +23,6 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 - Fixes:
   - Address segmentation faults in benchmarking tool.
   - Fix clamping issues for FP16 and BF16 in testing framework.
-- Added Convolution example using SME Indirect Matmul Kernels
 
 ## v1.8.0
 
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 28274282..bf47c80d 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -165,8 +165,6 @@ void init_indirection_table(
             }
         }
     }
-
-    return indirect_table;
 }
 
 /// Fills the matrix with incremental values according to the provided weight.
@@ -279,10 +277,10 @@ int main() {
     const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);
     std::vector indirect_table(itable_cols * itable_rows);
 
-    // Start of input feature map is passed as padding pointer, this is not neccessary.
-    const float16_t* pad_buffer = feature_map.data();
+    // Padding buffer 'pad_buffer' is set to nullptr as there is no padding in this example.
+    // Shapes specified are such that no padding should be needed.
     init_indirection_table(
-        feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols);
+        feature_map, indirect_table, nullptr, in_shape, out_shape, filter_height, filter_width, itable_cols);
 
     // -------------------------------------------------
     // 2. Pack LHS and RHS.
@@ -295,7 +293,7 @@ int main() {
     VEC_F16 packed_lhs(lhs_packed_size);
     VEC_F16 packed_rhs(rhs_packed_size);
 
-    // Padding is not used in the indirection buffer (as padding is valid), therefore pad_ptr is nullptr
+    // Padding is not used in the indirection buffer, therefore pad_ptr is nullptr
     // Ptr offset is provided as 0 as it is not needed to apply an offset to each valid pointer provided in the table in
     // this case.
     kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme(
--
GitLab


From 601ee7faa0c83537613075414bb0e7bbc9afc107 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 12:43:16 +0100
Subject: [PATCH 16/18] Enable building but not running SME Examples in CI

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 23442725..9a0ab422 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -119,16 +119,17 @@ build-examples:
     - .standard-rules
   script:
     - mkdir -p build
+    # Examples are built, but SME examples are not copied to prevent running on unsupported systems.
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
-          [[ $EXAMPLE == *sme* ]] && continue
           echo "-----------------------------------------------------------"
           echo "Build examples/${EXAMPLE}"
           echo "-----------------------------------------------------------"
           mkdir -p build_${EXAMPLE}
           cmake -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_C_FLAGS="-Werror" -DCMAKE_BUILD_TYPE=Release -S examples/$EXAMPLE -B build_${EXAMPLE}
           cmake --build build_${EXAMPLE} -j${PARALLEL_JOBS} --verbose
+          [[ $EXAMPLE == *sme* ]] && continue
           cp build_${EXAMPLE}/${EXAMPLE} build/
         else
           echo "No build file found for ${EXAMPLE}"
--
GitLab


From 13af4f8b601c042edb5ac5e0607acb334ce6aefd Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 13:31:51 +0100
Subject: [PATCH 17/18] Minor change

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9a0ab422..9ca03ac8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -119,7 +119,6 @@ build-examples:
     - .standard-rules
   script:
     - mkdir -p build
-    # Examples are built, but SME examples are not copied to prevent running on unsupported systems.
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
@@ -129,7 +128,6 @@ build-examples:
           mkdir -p build_${EXAMPLE}
           cmake -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_C_FLAGS="-Werror" -DCMAKE_BUILD_TYPE=Release -S examples/$EXAMPLE -B build_${EXAMPLE}
           cmake --build build_${EXAMPLE} -j${PARALLEL_JOBS} --verbose
-          [[ $EXAMPLE == *sme* ]] && continue
           cp build_${EXAMPLE}/${EXAMPLE} build/
         else
           echo "No build file found for ${EXAMPLE}"
@@ -153,6 +151,7 @@ test-examples:
           echo "-----------------------------------------------------------"
           echo "Run ${EXAMPLE}"
           echo "-----------------------------------------------------------"
+          [[ $EXAMPLE == *sme* ]] && continue
           build/${EXAMPLE} | tee -a example_${EXAMPLE}.log
         done
   artifacts:
--
GitLab


From aed6c82581a9d394ff7ded35be6e971d1ec80cf7 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 13:49:18 +0100
Subject: [PATCH 18/18] Fix built binary name

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
index aef754c9..a9499b84 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
@@ -6,7 +6,7 @@
 
 cmake_minimum_required(VERSION 3.16)
 
-project(conv2d_imatmul_clamp_f16_f16_f16p)
+project(conv2d_imatmul_clamp_f16_f16_f16p_sme2)
 
 set(CMAKE_CXX_STANDARD 17)
 set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../)
@@ -21,13 +21,13 @@ set(KAI_SOURCES
 
 # Files requires to build the executable
 add_executable(
-    conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp
+    conv2d_imatmul_clamp_f16_f16_f16p_sme2 conv2d_imatmul_clamp_f16_f16_f16p.cpp
     ${KAI_SOURCES})
 
-target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p
+target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p_sme2
     PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize"
 )
 
-target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p
+target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p_sme2
     PRIVATE $<$:KAI_DEBUG>
 )
--
GitLab