From e151cd142e781d1c74043ddccc3605bfae6a566e Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Tue, 20 May 2025 12:23:35 +0100 Subject: [PATCH 01/18] Add Conv2D example using FP16 IGEMM - Example demonstrates creating an indirect buffer using a Conv2D input tensor - Example demonstrates indirect buffer usage with imatmul kernels. Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 44 +++ .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 323 ++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt create mode 100644 examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt new file mode 100644 index 00000000..edc50297 --- /dev/null +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.16) + +project(matmul_clamp_f16_f16_f16p) + +set(CMAKE_CXX_STANDARD 17) +set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) +set(KAI_BUILD ${KAI_PATH}/build) +set(CMAKE_BUILD_TYPE Debug) + +include_directories( + ${KAI_PATH}/ + ${KAI_PATH}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/ + ${KAI_PATH}/kai/ukernels/matmul/pack/) + +set(KAI_SOURCES + ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c) + +set(KAI_HEADERS + ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h + ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h) + +# Files requires to build the executable +add_executable( + conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp + ${KAI_SOURCES} + ${KAI_HEADERS} + ) + +target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p + PRIVATE -march=armv8.2-a+sve+sve2 +) + +target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p + PRIVATE $<$:KAI_DEBUG> +) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp new file mode 100644 index 00000000..b554ccb2 --- /dev/null +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -0,0 +1,323 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +// Example usage for Indirect GEMM with a convolution operation using two half-precision matrices. +// + +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2) +#error This file must be compiled for AArch64, FEAT_SVE2. +#else // Architectural features check. 
+ +#include +#include + +#include +#include +#include + +// Include micro-kernel variants +#include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" +#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h" +#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h" + +using VEC_F16 = std::vector; + +namespace { + +constexpr float clamp_min = (float)-65504; +constexpr float clamp_max = (float)65504; + +struct Shape { + int n; + int h; + int w; + int c; + int co = 1; // Used for only kernel shape. + [[nodiscard]] auto size() const -> size_t { + return n * h * w * c * co; + } + std::string print() const { + std::stringstream ss; + ss << " [ " << n << " , " << h << " ," << w << " , " << c << ", " << co << " ] "; + return ss.str(); + } +}; + +struct ConvInfo { + // Conv info - we assume the following args. + int stride_x = 1; + int stride_y = 1; + int pad_left = 0; + int pad_top = 0; + int pad_right = 0; + int pad_bottom = 0; + int dilation_x = 1; + int dilation_y = 1; +}; + +/// Perform a convolution operation in nhwc data format. +/// @param[in] src Shape of the input tensor in [N, H, W, C] DataFormat +/// @param[in] weights Shape of the weights tensor in [1, H, W, CI, CO] Format +/// @param[in] dst Shape of the output tensor in [N, H, W, C] DataFormat +/// @param[in] in half float pointer to start of input tensor +/// @param[in] wei half float pointer to start of weights tensor +/// @param[in] bias half float pointer to start of bias tensor +/// @param[out] out half float pointer to start of output tensor +/// @param[in] clamp_min Minimum value to clamp final result +/// @param[in] clamp_max Max value to clamp final result +/// @param[in] cinfo Input arguments for convolution +void convolution_layer_nhwc( + Shape src, Shape weights, Shape dst, const float16_t* in, const float16_t* wei, const float16_t* bias, + float16_t* out, float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo = ConvInfo()) { + assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); + assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); + assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); + assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); + + for (int n = 0; n < src.n; ++n) { + for (int oh = 0; oh < dst.h; ++oh) { + for (int ow = 0; ow < dst.w; ++ow) { + for (int kh = 0; kh < weights.h; ++kh) { + if (src.h <= (oh + kh)) continue; + for (int kw = 0; kw < weights.w; ++kw) { + if (src.w <= (ow + kw)) continue; + + for (int ic = 0; ic < src.c; ++ic) { + auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; + auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); + auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + + for (int oc = 0; oc < dst.c; ++oc) { + // acc here. + out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); + } + } + } + } + + // Accumulate bias here. + for (int oc = 0; oc < dst.c; ++oc) { + // acc here. + auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + out[out_idx + oc] += bias[oc]; + } + } + } + } + + // Loop and clamp output data. + for (int i = 0; i < dst.size(); i++) { + out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; + out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; + } +} + +/// Fills the matrix with incremental values +void fill_matrix(size_t size, float16_t* dst, const float16_t weight) { + for (size_t i = 0; i < size; i++) { + dst[i] = float16_t(i * weight); + } +} + +/// Print the matrix +void print_matrix(size_t num_rows, size_t num_cols, const char* name, const float16_t* src) { + std::cout << name << " = [\n"; + for (size_t y = 0; y < num_rows; ++y) { + std::cout << " ["; + for (size_t x = 0; x < num_cols; ++x) { + std::cout << std::setprecision(0) << std::fixed << src[y * num_cols + x] << ", "; + } + std::cout << ("],\n"); + } + std::cout << ("]\n\n"); +} + +/// Print the matrix +void print_matrix(Shape shape, const char* name, const float16_t* src) { + std::cout << name << " = [\n"; + for (size_t n = 0; n < shape.n; n++) { + std::cout << "\n"; + for (size_t y = 0; y < shape.h; ++y) { + std::cout << " ["; + for (size_t x = 0; x < shape.w; x++) { + std::cout << "["; + for (size_t c = 0; c < shape.c; c++) { + if (c != 0) std::cout << ","; + std::cout << std::setprecision(0) << std::fixed + << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; + } + std::cout << "] "; + } + std::cout << ("],\n"); + } + } + std::cout << ("]\n\n"); +} + +// Verify the micro-kernel output matches the reference implementation +bool is_output_correct( + size_t num_rows, size_t num_cols, const float16_t tolerance, const float16_t* ref, const float16_t* act) { + bool is_valid = true; + int count = 0; + for (size_t i = 0; i < num_rows * num_cols; ++i) { + if ((std::fabs((ref[i] - act[i]) / act[i])) > tolerance) { + const size_t x = i % num_cols; + const size_t y = i / num_cols; + count++; + std::cout << std::setprecision(5) << std::fixed << "ERROR![" << y << "][" << x << "]: ref=" << ref[i] + << " vs. act=" << act[i] << "\n"; + + is_valid = false; + } + } + std::cout << "\n\nThere are " << count << " mismatches." << std::endl; + return is_valid; +} +} // namespace + +size_t round_up_division(size_t a, size_t b) { + return (a + b - 1) / b; +} + +int main() { + // Input tensor in {NHWC} format. + Shape input{5, 32, 32, 3}; // Layout : [N, H, W, C] + Shape kernel{1, 3, 3, 3, 2}; // Layout : [1, KH, KW, CI, CO] + Shape output{5, 30, 30, 2}; // Layout : [N, H, W, C] + + // This example only supports default conv arguments. + const ConvInfo conv_info; + + // Init Input buffers. + VEC_F16 in(input.size()); + VEC_F16 filter(kernel.size()); + VEC_F16 bias(output.c); + fill_matrix(in.size(), in.data(), 0.1f); + fill_matrix(filter.size(), filter.data(), 0.01f); + fill_matrix(bias.size(), bias.data(), 1.f); + + // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) + // N - Equivalent to output.c + // K - Width of LHS after im2col - is equivalent to (input_shape.c * kernel.w * kernel.w) when num_groups = 0 and no + // padding + const size_t M = input.n * ((input.w - kernel.w) + 1) * ((input.h - kernel.h) + 1); + const size_t K = (input.c * kernel.w * kernel.h); + const size_t N = kernel.size() / K; + + // Chunking is done in channel dimension (lowest input dim) + const size_t k_chunk_length = input.c; + const size_t k_chunk_count = K / k_chunk_length; + + // Check all shapes are valid. + assert(output.size() == M * N); + assert(input.n == output.n); + assert(kernel.c == input.c && kernel.co == output.c); + assert(kernel.n == 1); + + // ------------------------------------------ + // 1. Pack LHS - Create Indirection buffer. 
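+    // As a concrete illustration of the sizes derived above, for the shapes used in this example:
+    //   M = 5 * 30 * 30 = 4500, K = 3 * 3 * 3 = 27, N = 54 / 27 = 2,
+    //   k_chunk_length = 3 and k_chunk_count = 9.
+    // The indirection table built below has m_step columns and k_chunk_count * ceil(M / m_step) rows,
+    // where m_step depends on the SME vector length at runtime. The pointer for output row m and
+    // filter tap c is stored at index (m / m_step) * (k_chunk_count * m_step) + c * m_step + (m % m_step).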
+ // ------------------------------------------ + + const size_t m_step = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); + const size_t itable_rows = k_chunk_count * round_up_division(M, m_step); + const size_t itable_cols = m_step; + + const size_t out_hw_size = output.h * output.w; + const size_t in_hwc_size = input.size() / input.n; + + float16_t* zero_buffer = in.data(); + + // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. + // Number of blocks is = round_up_division(M, m_step) + const size_t block_size = k_chunk_count * m_step; + std::vector indirect_table(itable_cols * itable_rows, zero_buffer); + + for (size_t batch_idx = 0; batch_idx < output.n; batch_idx++) { + // We iterate over OH * OW dims and retrieve a pointer to relevant input index. + for (size_t out_idx = 0; out_idx < out_hw_size; out_idx++) { + const size_t output_x = out_idx % output.w; + const size_t output_y = out_idx / output.w; + + // Calculates column and row offsets for itable index with respect to block + size_t block_start_x = (((batch_idx * out_hw_size) + out_idx) % m_step); + size_t block_start_y = (((batch_idx * out_hw_size) + out_idx) / m_step); + + // These filter loops will fill the indirection table column-wise for kh*kw elements. + for (size_t kernel_y = 0; kernel_y < kernel.h; kernel_y++) { + const size_t input_y = + output_y * conv_info.stride_y + kernel_y * conv_info.dilation_y - conv_info.pad_top; + if (input_y < input.h) { + for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { + size_t input_x = + output_x * conv_info.stride_x + kernel_x * conv_info.dilation_x - conv_info.pad_left; + size_t kernel_index = kernel_y * kernel.w + kernel_x; + + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; + + if (input_x < input.w) { + indirect_table[index] = + ((float16_t*)in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + + input_x * input.c); + } else { + indirect_table[index] = zero_buffer; + } + } + } else { + for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { + size_t kernel_index = kernel_y * kernel.w + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; + indirect_table[index] = zero_buffer; + } + } + } + } + } + + // Init Output buffers. + VEC_F16 act_output(output.size()); + VEC_F16 ref_output(output.size()); + + // ------------------------------------------------- + // 1b. Pack LHS and RHS. + // ------------------------------------------------- + + // Initialise LHS Packed buffer and call packing kernel. + auto lhs_packed_size = + kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(M, k_chunk_count, k_chunk_length); + auto rhs_packed_size = + kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(N, k_chunk_count, k_chunk_length); + + VEC_F16 packed_lhs(lhs_packed_size); + VEC_F16 packed_rhs(rhs_packed_size); + + kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme( + M, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, packed_lhs.data()); + kai_run_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( + N, k_chunk_count, k_chunk_length, N * sizeof(float16_t), filter.data(), bias.data(), packed_rhs.data()); + + // ------------------------------------------------- + // 2. Perform matmul operation. 
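+    // Conceptually, the indirect matmul below computes, for every output row m (an output spatial
+    // position) and output column n (an output channel):
+    //   dst[m][n] = clamp(bias[n] + sum over k_chunk_count chunks and k_chunk_length values of
+    //                     lhs[m][chunk][k] * weights[chunk][k][n], clamp_min, clamp_max)
+    // where each LHS chunk is fetched through the pointers packed from the indirection table above.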
+ // ------------------------------------------------- + + kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa( + M, N, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), act_output.data(), + N * sizeof(float16_t), clamp_min, clamp_max); + + // ------------------------------------------------- + // 3. Call reference and compare output. + // ------------------------------------------------- + convolution_layer_nhwc( + input, kernel, output, in.data(), filter.data(), bias.data(), ref_output.data(), clamp_min, clamp_max); + + print_matrix(output, "\nTarget : ", act_output.data()); + print_matrix(output, "\nREf : ", ref_output.data()); + + is_output_correct(M, N, 0.01f, ref_output.data(), act_output.data()); + + return 0; +} + +#endif // Architectural features check. -- GitLab From 78e669006065bc848991200bda01918b6dd8b624 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Tue, 20 May 2025 12:23:35 +0100 Subject: [PATCH 02/18] Add Conv2D example using FP16 IGEMM - Example demonstrates creating an indirect buffer using a Conv2D input tensor - Example demonstrates indirect buffer usage with imatmul kernels. Signed-off-by: Mohammed Suhail Munshi --- examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt | 2 +- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index edc50297..916e2e38 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index b554ccb2..810c5caa 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // -- GitLab From 29a8d5778856f27410f6002af3b6d719a424687c Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 22 May 2025 15:31:02 +0100 Subject: [PATCH 03/18] Made suggested changes - Removed unused headers, removed unnecessary vectorization - Used ostream to print instead of stringstream - Removed usage of raw pointers where possible. 
- Made other minor suggested fixes Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 17 +--- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 85 ++++++++++--------- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index 916e2e38..be52dae3 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -6,37 +6,28 @@ cmake_minimum_required(VERSION 3.16) -project(matmul_clamp_f16_f16_f16p) +project(conv2d_imatmul_clamp_f16_f16_f16p) set(CMAKE_CXX_STANDARD 17) set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) set(KAI_BUILD ${KAI_PATH}/build) -set(CMAKE_BUILD_TYPE Debug) +set(CMAKE_BUILD_TYPE Release) -include_directories( - ${KAI_PATH}/ - ${KAI_PATH}/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/ - ${KAI_PATH}/kai/ukernels/matmul/pack/) +include_directories(${KAI_PATH}) set(KAI_SOURCES ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.c ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.c) -set(KAI_HEADERS - ${KAI_PATH}/kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h - ${KAI_PATH}/kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h - ${KAI_PATH}/kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h) - # Files requires to build the executable add_executable( conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp ${KAI_SOURCES} - ${KAI_HEADERS} ) target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p - PRIVATE -march=armv8.2-a+sve+sve2 + PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize" ) target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 810c5caa..00439a5d 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -4,16 +4,17 @@ // SPDX-License-Identifier: Apache-2.0 // -// Example usage for Indirect GEMM with a convolution operation using two half-precision matrices. +// Example usage for Indirect GEMM with a convolution operation using two half-precision float matrices. // -#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2) -#error This file must be compiled for AArch64, FEAT_SVE2. +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. #else // Architectural features check. -#include -#include +#include +#include #include #include #include @@ -31,21 +32,22 @@ constexpr float clamp_min = (float)-65504; constexpr float clamp_max = (float)65504; struct Shape { - int n; - int h; - int w; - int c; - int co = 1; // Used for only kernel shape. + size_t n; + size_t h; + size_t w; + size_t c; + size_t co = 1; // Used for only kernel shape. 
[[nodiscard]] auto size() const -> size_t { return n * h * w * c * co; } - std::string print() const { - std::stringstream ss; - ss << " [ " << n << " , " << h << " ," << w << " , " << c << ", " << co << " ] "; - return ss.str(); - } + friend std::ostream& operator<<(std::ostream& os, const Shape& shape); }; +std::ostream& operator<<(std::ostream& os, const Shape& shape) { + os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << ", " << shape.co << " ] "; + return os; +} + struct ConvInfo { // Conv info - we assume the following args. int stride_x = 1; @@ -70,27 +72,27 @@ struct ConvInfo { /// @param[in] clamp_max Max value to clamp final result /// @param[in] cinfo Input arguments for convolution void convolution_layer_nhwc( - Shape src, Shape weights, Shape dst, const float16_t* in, const float16_t* wei, const float16_t* bias, - float16_t* out, float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo = ConvInfo()) { + Shape src, Shape weights, Shape dst, const VEC_F16& in, const VEC_F16& wei, const VEC_F16& bias, VEC_F16& out, + float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo) { assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); - for (int n = 0; n < src.n; ++n) { - for (int oh = 0; oh < dst.h; ++oh) { - for (int ow = 0; ow < dst.w; ++ow) { - for (int kh = 0; kh < weights.h; ++kh) { + for (size_t n = 0; n < src.n; ++n) { + for (size_t oh = 0; oh < dst.h; ++oh) { + for (size_t ow = 0; ow < dst.w; ++ow) { + for (size_t kh = 0; kh < weights.h; ++kh) { if (src.h <= (oh + kh)) continue; - for (int kw = 0; kw < weights.w; ++kw) { + for (size_t kw = 0; kw < weights.w; ++kw) { if (src.w <= (ow + kw)) continue; - for (int ic = 0; ic < src.c; ++ic) { + for (size_t ic = 0; ic < src.c; ++ic) { auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; - for (int oc = 0; oc < dst.c; ++oc) { + for (size_t oc = 0; oc < dst.c; ++oc) { // acc here. out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); } @@ -99,7 +101,7 @@ void convolution_layer_nhwc( } // Accumulate bias here. - for (int oc = 0; oc < dst.c; ++oc) { + for (size_t oc = 0; oc < dst.c; ++oc) { // acc here. auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; out[out_idx + oc] += bias[oc]; @@ -109,21 +111,21 @@ void convolution_layer_nhwc( } // Loop and clamp output data. - for (int i = 0; i < dst.size(); i++) { + for (size_t i = 0; i < dst.size(); i++) { out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; } } /// Fills the matrix with incremental values -void fill_matrix(size_t size, float16_t* dst, const float16_t weight) { +void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { dst[i] = float16_t(i * weight); } } /// Print the matrix -void print_matrix(size_t num_rows, size_t num_cols, const char* name, const float16_t* src) { +void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t y = 0; y < num_rows; ++y) { std::cout << " ["; @@ -136,7 +138,7 @@ void print_matrix(size_t num_rows, size_t num_cols, const char* name, const floa } /// Print the matrix -void print_matrix(Shape shape, const char* name, const float16_t* src) { +void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { std::cout << "\n"; @@ -145,7 +147,7 @@ void print_matrix(Shape shape, const char* name, const float16_t* src) { for (size_t x = 0; x < shape.w; x++) { std::cout << "["; for (size_t c = 0; c < shape.c; c++) { - if (c != 0) std::cout << ","; + if (c != 0) std::cout << " , "; std::cout << std::setprecision(0) << std::fixed << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; } @@ -159,7 +161,7 @@ void print_matrix(Shape shape, const char* name, const float16_t* src) { // Verify the micro-kernel output matches the reference implementation bool is_output_correct( - size_t num_rows, size_t num_cols, const float16_t tolerance, const float16_t* ref, const float16_t* act) { + size_t num_rows, size_t num_cols, const float16_t tolerance, const VEC_F16& ref, const VEC_F16& act) { bool is_valid = true; int count = 0; for (size_t i = 0; i < num_rows * num_cols; ++i) { @@ -195,9 +197,9 @@ int main() { VEC_F16 in(input.size()); VEC_F16 filter(kernel.size()); VEC_F16 bias(output.c); - fill_matrix(in.size(), in.data(), 0.1f); - fill_matrix(filter.size(), filter.data(), 0.01f); - fill_matrix(bias.size(), bias.data(), 1.f); + fill_matrix(in.size(), in, 0.1f); + fill_matrix(filter.size(), filter, 0.01f); + fill_matrix(bias.size(), bias, 1.f); // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) // N - Equivalent to output.c @@ -259,8 +261,7 @@ int main() { if (input_x < input.w) { indirect_table[index] = - ((float16_t*)in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + - input_x * input.c); + (in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + input_x * input.c); } else { indirect_table[index] = zero_buffer; } @@ -309,13 +310,13 @@ int main() { // ------------------------------------------------- // 3. Call reference and compare output. 
// ------------------------------------------------- - convolution_layer_nhwc( - input, kernel, output, in.data(), filter.data(), bias.data(), ref_output.data(), clamp_min, clamp_max); - - print_matrix(output, "\nTarget : ", act_output.data()); - print_matrix(output, "\nREf : ", ref_output.data()); + convolution_layer_nhwc(input, kernel, output, in, filter, bias, ref_output, clamp_min, clamp_max, conv_info); - is_output_correct(M, N, 0.01f, ref_output.data(), act_output.data()); +#ifdef KAI_DEBUG + print_tensor(output, "\nTarget : ", act_output); + print_tensor(output, "\nREf : ", ref_output); +#endif // KAI_DEBUG + is_output_correct(M, N, 0.01f, ref_output, act_output); return 0; } -- GitLab From df561020e9f9b42a0d7081cac682ce4e318e3399 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Thu, 22 May 2025 17:46:36 +0100 Subject: [PATCH 04/18] Use std::clamp Signed-off-by: Mohammed Suhail Munshi --- examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt | 1 - .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index be52dae3..2cf37a29 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -11,7 +11,6 @@ project(conv2d_imatmul_clamp_f16_f16_f16p) set(CMAKE_CXX_STANDARD 17) set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../) set(KAI_BUILD ${KAI_PATH}/build) -set(CMAKE_BUILD_TYPE Release) include_directories(${KAI_PATH}) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 00439a5d..4ff917d4 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -14,6 +14,7 @@ #include +#include #include #include #include @@ -112,8 +113,7 @@ void convolution_layer_nhwc( // Loop and clamp output data. for (size_t i = 0; i < dst.size(); i++) { - out[i] = (out[i] < clamp_min) ? clamp_min : out[i]; - out[i] = (out[i] > clamp_max) ? 
clamp_max : out[i]; + out[i] = std::clamp(out[i], clamp_min, clamp_max); } } -- GitLab From 6ca05c52d58010b9f21660c57216489b5e364036 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 09:58:05 +0100 Subject: [PATCH 05/18] Apply suggested changes Signed-off-by: Mohammed Suhail Munshi --- .../CMakeLists.txt | 3 +- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 355 +++++++++--------- 2 files changed, 184 insertions(+), 174 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt index 2cf37a29..aef754c9 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt @@ -22,8 +22,7 @@ set(KAI_SOURCES # Files requires to build the executable add_executable( conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp - ${KAI_SOURCES} - ) + ${KAI_SOURCES}) target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize" diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 4ff917d4..2dec00dd 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -29,94 +29,139 @@ using VEC_F16 = std::vector; namespace { -constexpr float clamp_min = (float)-65504; -constexpr float clamp_max = (float)65504; +constexpr float clamp_min = -65504.0F; +constexpr float clamp_max = 65504.0F; struct Shape { size_t n; size_t h; size_t w; size_t c; - size_t co = 1; // Used for only kernel shape. [[nodiscard]] auto size() const -> size_t { - return n * h * w * c * co; + return n * h * w * c; } - friend std::ostream& operator<<(std::ostream& os, const Shape& shape); -}; - -std::ostream& operator<<(std::ostream& os, const Shape& shape) { - os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << ", " << shape.co << " ] "; - return os; -} -struct ConvInfo { - // Conv info - we assume the following args. - int stride_x = 1; - int stride_y = 1; - int pad_left = 0; - int pad_top = 0; - int pad_right = 0; - int pad_bottom = 0; - int dilation_x = 1; - int dilation_y = 1; +#ifdef KAI_DEBUG + friend std::ostream& operator<<(std::ostream& os, const Shape& shape) { + os << " [ " << shape.n << " , " << shape.h << " ," << shape.w << " , " << shape.c << " ] "; + return os; + } +#endif }; /// Perform a convolution operation in nhwc data format. -/// @param[in] src Shape of the input tensor in [N, H, W, C] DataFormat -/// @param[in] weights Shape of the weights tensor in [1, H, W, CI, CO] Format -/// @param[in] dst Shape of the output tensor in [N, H, W, C] DataFormat -/// @param[in] in half float pointer to start of input tensor -/// @param[in] wei half float pointer to start of weights tensor +/// @param[in] in_shape Shape of the input tensor in [N, H, W, C] DataFormat +/// @param[in] out_shape Shape of the output tensor in [N, H, W, C] DataFormat +/// @param[in] filter_height Height of convolution filter. +/// @param[in] filter_width Width of convolution filter. 
+/// @param[in] feature_map half float pointer to start of input tensor +/// @param[in] weights half float pointer to start of weights tensor /// @param[in] bias half float pointer to start of bias tensor /// @param[out] out half float pointer to start of output tensor /// @param[in] clamp_min Minimum value to clamp final result /// @param[in] clamp_max Max value to clamp final result -/// @param[in] cinfo Input arguments for convolution void convolution_layer_nhwc( - Shape src, Shape weights, Shape dst, const VEC_F16& in, const VEC_F16& wei, const VEC_F16& bias, VEC_F16& out, - float16_t clamp_min, float16_t clamp_max, ConvInfo cinfo) { - assert(cinfo.stride_x == 1 && cinfo.stride_y == 1); - assert(cinfo.dilation_x == 1 && cinfo.dilation_y == 1); - assert(cinfo.pad_left == 0 && cinfo.pad_right == 0); - assert(cinfo.pad_bottom == 0 && cinfo.pad_top == 0); - - for (size_t n = 0; n < src.n; ++n) { - for (size_t oh = 0; oh < dst.h; ++oh) { - for (size_t ow = 0; ow < dst.w; ++ow) { - for (size_t kh = 0; kh < weights.h; ++kh) { - if (src.h <= (oh + kh)) continue; - for (size_t kw = 0; kw < weights.w; ++kw) { - if (src.w <= (ow + kw)) continue; - - for (size_t ic = 0; ic < src.c; ++ic) { - auto in_idx = ((n * src.h + (oh + kh)) * src.w + (ow + kw)) * src.c + ic; - auto ker_idx = (((kh * weights.w + kw) * src.c + ic) * dst.c); - auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; - - for (size_t oc = 0; oc < dst.c; ++oc) { - // acc here. - out[out_idx + oc] += (in[in_idx] * wei[ker_idx + oc]); + Shape in_shape, Shape out_shape, const size_t filter_height, const size_t filter_width, const VEC_F16& feature_map, + const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, float16_t clamp_max) { + for (size_t n = 0; n < out_shape.n; ++n) { + for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { + for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { + // Apply filter to feature map. + for (size_t kernel_h = 0; kernel_h < filter_height; ++kernel_h) { + if (in_shape.h <= (out_h + kernel_h)) continue; + for (size_t kernel_w = 0; kernel_w < filter_width; ++kernel_w) { + if (in_shape.w <= (out_w + kernel_w)) continue; + + for (size_t ic = 0; ic < in_shape.c; ++ic) { + auto in_idx = + ((n * in_shape.h + (out_h + kernel_h)) * in_shape.w + (out_w + kernel_w)) * in_shape.c + + ic; + auto weights_idx = (((kernel_h * filter_width + kernel_w) * in_shape.c + ic) * out_shape.c); + auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c; + + for (size_t oc = 0; oc < out_shape.c; ++oc) { + // Perform actual accumulation and store in output vector + out[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]); } } } } - // Accumulate bias here. - for (size_t oc = 0; oc < dst.c; ++oc) { - // acc here. - auto out_idx = ((n * dst.h + oh) * dst.w + ow) * dst.c; + // Perform bias accumulation for channel idx and store in output vector. + for (size_t oc = 0; oc < out_shape.c; ++oc) { + auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c; out[out_idx + oc] += bias[oc]; } } } } - // Loop and clamp output data. - for (size_t i = 0; i < dst.size(); i++) { + // Apply clamping in-place to output of covolution. + for (size_t i = 0; i < out_shape.size(); i++) { out[i] = std::clamp(out[i], clamp_min, clamp_max); } } +/// Fill a provided indirection table according to tensor shape parameters. +/// @param[in] feature_map Input feature map tensor +/// @param[out] indirection_table Indirection buffer to fill in place. 
+/// @param[in] pad_buffer Pointer to start of padding. +/// @param[in] in_shape Shape of input tensor [N,H,W,C] format. +/// @param[in] out_shape Shape of output tensor [N,H,W,C] format. +/// @param[in] filter_height Height of convolution filter. +/// @param[in] filter_width Width of convolution filter. +/// @param[in] itable_cols Number of columns in indirection table (m_step) +std::vector init_indirection_table( + VEC_F16& feature_map, std::vector& indirect_table, float16_t* pad_buffer, const Shape& in_shape, + const Shape& out_shape, const size_t filter_height, const size_t filter_width, const size_t itable_cols) { + // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. + // Number of blocks is = round_up_division(M, m_step) + const size_t block_size = filter_height * filter_width * itable_cols; + const size_t in_hwc_size = in_shape.h * in_shape.w * in_shape.c; + + // The following code iterates over the first 3 dims of the output tensor and retrieves KH*KW number of pointers to + // the input matrix for each idx. These pointers are stored columnwise in the itable, beginning with an offset. + for (size_t batch_idx = 0; batch_idx < out_shape.n; batch_idx++) { + for (size_t output_y = 0; output_y < out_shape.h; output_y++) { + for (size_t output_x = 0; output_x < out_shape.w; output_x++) { + // Calculates column and row offsets for itable index with respect to current block location and itable + // column length (equivalent to m_step) The block start x/y offsets ensure the data is padded in the + // format expected by the LHS Packing kernel. + size_t block_start_x = + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) % itable_cols); + size_t block_start_y = + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) / itable_cols); + for (size_t kernel_y = 0; kernel_y < filter_height; kernel_y++) { + const size_t input_y = output_y + kernel_y; + if (input_y < in_shape.h) { + for (size_t kernel_x = 0; kernel_x < filter_width; kernel_x++) { + size_t input_x = output_x + kernel_x; + size_t kernel_index = kernel_y * filter_width + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * itable_cols; + + if (input_x < in_shape.w) { + indirect_table[index] = + (feature_map.data() + batch_idx * in_hwc_size + input_y * in_shape.w * in_shape.c + + input_x * in_shape.c); + } else { + indirect_table[index] = pad_buffer; + } + } + } else { + for (size_t kernel_x = 0; kernel_x < filter_width; kernel_x++) { + size_t kernel_index = kernel_y * filter_width + kernel_x; + size_t index = (block_start_y * block_size) + block_start_x + kernel_index * itable_cols; + indirect_table[index] = pad_buffer; + } + } + } + } + } + } + + return indirect_table; +} + /// Fills the matrix with incremental values void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { @@ -124,20 +169,29 @@ void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { } } -/// Print the matrix +#ifdef KAI_DEBUG +/// Function prints a matrix according to the rows and columns specified. +/// @param[in] num_rows Number of rows in the matrix. +/// @param[in] num_cols Number of columns in the matrix. +/// @param[in] name The name of the matrix to be printed. This will be included in the output. +/// @param[in] src A vector of F16 elements representing the matrix. 
void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) { - std::cout << name << " = [\n"; - for (size_t y = 0; y < num_rows; ++y) { + std::cout << "\n" << name << " = [\n"; + for (size_t row = 0; row < num_rows; ++row) { std::cout << " ["; - for (size_t x = 0; x < num_cols; ++x) { - std::cout << std::setprecision(0) << std::fixed << src[y * num_cols + x] << ", "; + for (size_t col = 0; col < num_cols; ++col) { + std::cout << std::setprecision(0) << std::fixed << src[row * num_cols + col] << ", "; } std::cout << ("],\n"); } std::cout << ("]\n\n"); } -/// Print the matrix +/// Function prints a tensor in NHWC format. +/// Width and channels are printed on the same line. Square brackets are used to denote dimensions. +/// @param[in] shape A struct containing the NHWC shape of the tensor. +/// @param[in] name Name of the tensor +/// @param[in] src A vector of F16 elements representing the tensor. void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { @@ -148,7 +202,7 @@ void print_tensor(Shape shape, const char* name, const VEC_F16& src) { std::cout << "["; for (size_t c = 0; c < shape.c; c++) { if (c != 0) std::cout << " , "; - std::cout << std::setprecision(0) << std::fixed + std::cout << std::setprecision(1) << std::fixed << src[n * shape.h * shape.w * shape.c + y * shape.w * shape.c + x * shape.c + c]; } std::cout << "] "; @@ -158,6 +212,7 @@ void print_tensor(Shape shape, const char* name, const VEC_F16& src) { } std::cout << ("]\n\n"); } +#endif // KAI_DEBUG // Verify the micro-kernel output matches the reference implementation bool is_output_correct( @@ -178,145 +233,101 @@ bool is_output_correct( std::cout << "\n\nThere are " << count << " mismatches." << std::endl; return is_valid; } -} // namespace size_t round_up_division(size_t a, size_t b) { return (a + b - 1) / b; } +} // namespace int main() { - // Input tensor in {NHWC} format. - Shape input{5, 32, 32, 3}; // Layout : [N, H, W, C] - Shape kernel{1, 3, 3, 3, 2}; // Layout : [1, KH, KW, CI, CO] - Shape output{5, 30, 30, 2}; // Layout : [N, H, W, C] - - // This example only supports default conv arguments. - const ConvInfo conv_info; - - // Init Input buffers. - VEC_F16 in(input.size()); - VEC_F16 filter(kernel.size()); - VEC_F16 bias(output.c); - fill_matrix(in.size(), in, 0.1f); - fill_matrix(filter.size(), filter, 0.01f); + // Arguments for convolution operation. + // Padding must be valid + const size_t batch_size = 5; + const size_t input_height = 32; + const size_t input_width = 32; + const size_t input_channels = 3; + const size_t filter_height = 3; + const size_t filter_width = 3; + const size_t out_channels = 2; + + // Use shape arguments to define tensor shapes in NHWC Format. + const Shape in_shape{batch_size, input_height, input_width, input_channels}; + const Shape weights_shape{filter_height, filter_width, input_channels, out_channels}; + const Shape out_shape{ + batch_size, (input_height - filter_height + 1), (input_width - filter_width + 1), out_channels}; + + // Define and Fill Input Tensors for operation using shapes + VEC_F16 feature_map(in_shape.size()); + VEC_F16 weights(weights_shape.size()); + VEC_F16 bias(out_channels); + + // Fill by iterating over in 1D and multiplying idx by the weight supplied as argument. 
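+    // For example, a weight of 0.1f fills the buffer with 0.0, 0.1, 0.2, 0.3, ...
+    // (each value converted to float16_t).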
+ fill_matrix(feature_map.size(), feature_map, 0.1f); + fill_matrix(weights.size(), weights, 0.2f); fill_matrix(bias.size(), bias, 1.f); - // M - Equivalent to height of LHS after im2col : (width - k_width + 1) * (height - k_height + 1) - // N - Equivalent to output.c - // K - Width of LHS after im2col - is equivalent to (input_shape.c * kernel.w * kernel.w) when num_groups = 0 and no - // padding - const size_t M = input.n * ((input.w - kernel.w) + 1) * ((input.h - kernel.h) + 1); - const size_t K = (input.c * kernel.w * kernel.h); - const size_t N = kernel.size() / K; - - // Chunking is done in channel dimension (lowest input dim) - const size_t k_chunk_length = input.c; - const size_t k_chunk_count = K / k_chunk_length; - - // Check all shapes are valid. - assert(output.size() == M * N); - assert(input.n == output.n); - assert(kernel.c == input.c && kernel.co == output.c); - assert(kernel.n == 1); - - // ------------------------------------------ - // 1. Pack LHS - Create Indirection buffer. - // ------------------------------------------ - - const size_t m_step = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); - const size_t itable_rows = k_chunk_count * round_up_division(M, m_step); - const size_t itable_cols = m_step; - - const size_t out_hw_size = output.h * output.w; - const size_t in_hwc_size = input.size() / input.n; - - float16_t* zero_buffer = in.data(); - - // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step. - // Number of blocks is = round_up_division(M, m_step) - const size_t block_size = k_chunk_count * m_step; - std::vector indirect_table(itable_cols * itable_rows, zero_buffer); - - for (size_t batch_idx = 0; batch_idx < output.n; batch_idx++) { - // We iterate over OH * OW dims and retrieve a pointer to relevant input index. - for (size_t out_idx = 0; out_idx < out_hw_size; out_idx++) { - const size_t output_x = out_idx % output.w; - const size_t output_y = out_idx / output.w; - - // Calculates column and row offsets for itable index with respect to block - size_t block_start_x = (((batch_idx * out_hw_size) + out_idx) % m_step); - size_t block_start_y = (((batch_idx * out_hw_size) + out_idx) / m_step); - - // These filter loops will fill the indirection table column-wise for kh*kw elements. - for (size_t kernel_y = 0; kernel_y < kernel.h; kernel_y++) { - const size_t input_y = - output_y * conv_info.stride_y + kernel_y * conv_info.dilation_y - conv_info.pad_top; - if (input_y < input.h) { - for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { - size_t input_x = - output_x * conv_info.stride_x + kernel_x * conv_info.dilation_x - conv_info.pad_left; - size_t kernel_index = kernel_y * kernel.w + kernel_x; - - size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; - - if (input_x < input.w) { - indirect_table[index] = - (in.data() + batch_idx * in_hwc_size + input_y * input.w * input.c + input_x * input.c); - } else { - indirect_table[index] = zero_buffer; - } - } - } else { - for (size_t kernel_x = 0; kernel_x < kernel.w; kernel_x++) { - size_t kernel_index = kernel_y * kernel.w + kernel_x; - size_t index = (block_start_y * block_size) + block_start_x + kernel_index * m_step; - indirect_table[index] = zero_buffer; - } - } - } - } - } - - // Init Output buffers. 
- VEC_F16 act_output(output.size()); - VEC_F16 ref_output(output.size()); + // The following are used as parameters in the indirection kernels + const size_t out_nhw_size = out_shape.n * out_shape.h * out_shape.w; + const size_t k_chunk_length = input_channels; + const size_t k_chunk_count = filter_height * filter_width; // ------------------------------------------------- - // 1b. Pack LHS and RHS. + // 1. Create Indirection buffer. // ------------------------------------------------- + // Define and Fill the indirection table in the format expected of the LHS Indirection Matmul kernel. + // NOTE: out_nhw_size is equivalent to M argument for Indirection kernels. + // out_channels is equivalent to N argument for Indirection kernels. + const size_t itable_cols = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme(); + const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols); + std::vector indirect_table(itable_cols * itable_rows); + + // Start of input feature map is passed as padding pointer, this is not neccessary. + float16_t* pad_buffer = feature_map.data(); + init_indirection_table( + feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols); - // Initialise LHS Packed buffer and call packing kernel. + // ------------------------------------------------- + // 2. Pack LHS and RHS. + // ------------------------------------------------- auto lhs_packed_size = - kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(M, k_chunk_count, k_chunk_length); - auto rhs_packed_size = - kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(N, k_chunk_count, k_chunk_length); + kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(out_nhw_size, k_chunk_count, k_chunk_length); + auto rhs_packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( + out_channels, k_chunk_count, k_chunk_length); VEC_F16 packed_lhs(lhs_packed_size); VEC_F16 packed_rhs(rhs_packed_size); + // Padding is not used in the indirection buffer (as padding is valid), therefore pad_ptr is nullptr + // Ptr offset is provided as 0 as it is not needed to apply an offset to each valid pointer provided in the table in + // this case. kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme( - M, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, packed_lhs.data()); + out_nhw_size, k_chunk_count, k_chunk_length, (const void**)indirect_table.data(), 0, nullptr, + packed_lhs.data()); kai_run_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme( - N, k_chunk_count, k_chunk_length, N * sizeof(float16_t), filter.data(), bias.data(), packed_rhs.data()); + out_channels, k_chunk_count, k_chunk_length, out_channels * sizeof(float16_t), weights.data(), bias.data(), + packed_rhs.data()); // ------------------------------------------------- - // 2. Perform matmul operation. + // 3. Perform matmul operation and call reference, then compare. // ------------------------------------------------- + VEC_F16 act_output(out_shape.size()); + VEC_F16 ref_output(out_shape.size()); kai_run_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa( - M, N, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), act_output.data(), - N * sizeof(float16_t), clamp_min, clamp_max); + out_nhw_size, out_channels, k_chunk_count, k_chunk_length, packed_lhs.data(), packed_rhs.data(), + act_output.data(), out_channels * sizeof(float16_t), clamp_min, clamp_max); - // ------------------------------------------------- - // 3. 
Call reference and compare output. - // ------------------------------------------------- - convolution_layer_nhwc(input, kernel, output, in, filter, bias, ref_output, clamp_min, clamp_max, conv_info); + convolution_layer_nhwc( + in_shape, out_shape, filter_height, filter_width, feature_map, weights, bias, ref_output, clamp_min, clamp_max); #ifdef KAI_DEBUG + std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape + << " Output Shape : " << out_shape << std::endl; print_tensor(output, "\nTarget : ", act_output); print_tensor(output, "\nREf : ", ref_output); #endif // KAI_DEBUG - is_output_correct(M, N, 0.01f, ref_output, act_output); + + is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output); return 0; } -- GitLab From 2d41918e3a43b888e6f1569f05534479874476cb Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 11:55:57 +0100 Subject: [PATCH 06/18] Minor changes to improve comment documation Signed-off-by: Mohammed Suhail Munshi --- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index 2dec00dd..bed277bf 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -19,6 +19,7 @@ #include #include #include +#include // Include micro-kernel variants #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" @@ -61,8 +62,9 @@ struct Shape { /// @param[in] clamp_min Minimum value to clamp final result /// @param[in] clamp_max Max value to clamp final result void convolution_layer_nhwc( - Shape in_shape, Shape out_shape, const size_t filter_height, const size_t filter_width, const VEC_F16& feature_map, - const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, float16_t clamp_max) { + const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width, + const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, + float16_t clamp_max) { for (size_t n = 0; n < out_shape.n; ++n) { for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { @@ -162,7 +164,10 @@ std::vector init_indirection_table( return indirect_table; } -/// Fills the matrix with incremental values +/// Fills the matrix with incremental values according to the provided weight. +/// @param[in] size Total number of elements to fill in passed vector;. +/// @param[in] dst Vector representing a tensor to fill. +/// @param[in] weight A weight value to increment by. void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { for (size_t i = 0; i < size; i++) { dst[i] = float16_t(i * weight); @@ -192,7 +197,7 @@ void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_ /// @param[in] shape A struct containing the NHWC shape of the tensor. /// @param[in] name Name of the tensor /// @param[in] src A vector of F16 elements representing the tensor. 
-void print_tensor(Shape shape, const char* name, const VEC_F16& src) { +void print_tensor(const Shape& shape, const char* name, const VEC_F16& src) { std::cout << name << " = [\n"; for (size_t n = 0; n < shape.n; n++) { std::cout << "\n"; @@ -261,7 +266,7 @@ int main() { VEC_F16 weights(weights_shape.size()); VEC_F16 bias(out_channels); - // Fill by iterating over in 1D and multiplying idx by the weight supplied as argument. + // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0. fill_matrix(feature_map.size(), feature_map, 0.1f); fill_matrix(weights.size(), weights, 0.2f); fill_matrix(bias.size(), bias, 1.f); -- GitLab From c594b7002bb2ee18957f9c8adfca85d4064d4c07 Mon Sep 17 00:00:00 2001 From: Mohammed Suhail Munshi Date: Wed, 28 May 2025 12:12:54 +0100 Subject: [PATCH 07/18] Bug fixes Signed-off-by: Mohammed Suhail Munshi --- .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp index bed277bf..096d591f 100644 --- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp +++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp @@ -63,8 +63,8 @@ struct Shape { /// @param[in] clamp_max Max value to clamp final result void convolution_layer_nhwc( const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width, - const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float16_t clamp_min, - float16_t clamp_max) { + const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float clamp_min, + float clamp_max) { for (size_t n = 0; n < out_shape.n; ++n) { for (size_t out_h = 0; out_h < out_shape.h; ++out_h) { for (size_t out_w = 0; out_w < out_shape.w; ++out_w) { @@ -100,7 +100,7 @@ void convolution_layer_nhwc( // Apply clamping in-place to output of covolution. for (size_t i = 0; i < out_shape.size(); i++) { - out[i] = std::clamp(out[i], clamp_min, clamp_max); + out[i] = std::clamp(out[i], static_cast(clamp_min), static_cast(clamp_max)); } } @@ -130,9 +130,9 @@ std::vector init_indirection_table( // column length (equivalent to m_step) The block start x/y offsets ensure the data is padded in the // format expected by the LHS Packing kernel. size_t block_start_x = - (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) % itable_cols); + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.w + output_x)) % itable_cols); size_t block_start_y = - (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.h + output_x)) / itable_cols); + (((batch_idx * out_shape.h * out_shape.w) + (output_y * out_shape.w + output_x)) / itable_cols); for (size_t kernel_y = 0; kernel_y < filter_height; kernel_y++) { const size_t input_y = output_y + kernel_y; if (input_y < in_shape.h) { @@ -175,23 +175,6 @@ void fill_matrix(size_t size, VEC_F16& dst, const float16_t weight) { } #ifdef KAI_DEBUG -/// Function prints a matrix according to the rows and columns specified. -/// @param[in] num_rows Number of rows in the matrix. -/// @param[in] num_cols Number of columns in the matrix. -/// @param[in] name The name of the matrix to be printed. This will be included in the output. 
-/// @param[in] src A vector of F16 elements representing the matrix.
-void print_matrix(size_t num_rows, size_t num_cols, const char* name, const VEC_F16& src) {
-    std::cout << "\n" << name << " = [\n";
-    for (size_t row = 0; row < num_rows; ++row) {
-        std::cout << " [";
-        for (size_t col = 0; col < num_cols; ++col) {
-            std::cout << std::setprecision(0) << std::fixed << src[row * num_cols + col] << ", ";
-        }
-        std::cout << ("],\n");
-    }
-    std::cout << ("]\n\n");
-}
-
 /// Function prints a tensor in NHWC format.
 /// Width and channels are printed on the same line. Square brackets are used to denote dimensions.
 /// @param[in] shape A struct containing the NHWC shape of the tensor.
@@ -251,8 +234,8 @@ int main() {
     const size_t input_height = 32;
     const size_t input_width = 32;
     const size_t input_channels = 3;
-    const size_t filter_height = 3;
-    const size_t filter_width = 3;
+    const size_t filter_height = 5;
+    const size_t filter_width = 2;
     const size_t out_channels = 2;
 
     // Use shape arguments to define tensor shapes in NHWC Format.
@@ -261,6 +244,11 @@ int main() {
     const Shape out_shape{
         batch_size, (input_height - filter_height + 1), (input_width - filter_width + 1), out_channels};
 
+#ifdef KAI_DEBUG
+    std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape
+              << " Output Shape : " << out_shape << std::endl;
+#endif // KAI_DEBUG
+
     // Define and Fill Input Tensors for operation using shapes
     VEC_F16 feature_map(in_shape.size());
     VEC_F16 weights(weights_shape.size());
@@ -268,7 +256,7 @@ int main() {
     // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0.
     fill_matrix(feature_map.size(), feature_map, 0.1f);
-    fill_matrix(weights.size(), weights, 0.2f);
+    fill_matrix(weights.size(), weights, 0.1f);
     fill_matrix(bias.size(), bias, 1.f);
 
     // The following are used as parameters in the indirection kernels
@@ -326,10 +314,8 @@ int main() {
         in_shape, out_shape, filter_height, filter_width, feature_map, weights, bias, ref_output, clamp_min, clamp_max);
 
 #ifdef KAI_DEBUG
-    std::cout << "\nInput Shape : " << in_shape << " Kernel Shape : " << weights_shape
-              << " Output Shape : " << out_shape << std::endl;
-    print_tensor(output, "\nTarget : ", act_output);
-    print_tensor(output, "\nREf : ", ref_output);
+    print_tensor(out_shape, "\nTarget : ", act_output);
+    print_tensor(out_shape, "\nREf : ", ref_output);
 #endif // KAI_DEBUG
 
     is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output);
--
GitLab


From 29d3ed28743f4b9556a51a42a9a40eb75b4c5c1b Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Wed, 28 May 2025 14:09:15 +0100
Subject: [PATCH 08/18] Use FP32 Accumulator in Conv2D reference

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 096d591f..c2561b4a 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -65,6 +65,9 @@ void convolution_layer_nhwc(
     const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
     const VEC_F16& feature_map, const VEC_F16& weights, const VEC_F16& bias, VEC_F16& out, float clamp_min,
     float clamp_max) {
+    // We accumulate in FP32 and clamp later.
+    std::vector<float> acc(out_shape.size());
+
     for (size_t n = 0; n < out_shape.n; ++n) {
         for (size_t out_h = 0; out_h < out_shape.h; ++out_h) {
             for (size_t out_w = 0; out_w < out_shape.w; ++out_w) {
@@ -83,7 +86,7 @@ void convolution_layer_nhwc(
                         for (size_t oc = 0; oc < out_shape.c; ++oc) {
                             // Perform actual accumulation and store in output vector
-                            out[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]);
+                            acc[out_idx + oc] += (feature_map[in_idx] * weights[weights_idx + oc]);
                         }
                     }
                 }
 
@@ -92,15 +95,15 @@ void convolution_layer_nhwc(
                 // Perform bias accumulation for channel idx and store in output vector.
                 for (size_t oc = 0; oc < out_shape.c; ++oc) {
                     auto out_idx = ((n * out_shape.h + out_h) * out_shape.w + out_w) * out_shape.c;
-                    out[out_idx + oc] += bias[oc];
+                    acc[out_idx + oc] += bias[oc];
                 }
             }
         }
     }
 
-    // Apply clamping in-place to output of covolution.
+    // Apply clamping to accumulator, cast to FP16 and store in output vector at the same idx.
     for (size_t i = 0; i < out_shape.size(); i++) {
-        out[i] = std::clamp(out[i], static_cast<float16_t>(clamp_min), static_cast<float16_t>(clamp_max));
+        out[i] = static_cast<float16_t>(std::clamp(acc[i], clamp_min, clamp_max));
     }
 }
 
@@ -256,7 +259,7 @@ int main() {
     // Fill by iterating each element and incrementing each time by the provided weight, beginning at 0.
     fill_matrix(feature_map.size(), feature_map, 0.1f);
-    fill_matrix(weights.size(), weights, 0.1f);
+    fill_matrix(weights.size(), weights, 0.01f);
     fill_matrix(bias.size(), bias, 1.f);
 
     // The following are used as parameters in the indirection kernels
@@ -318,7 +321,7 @@ int main() {
     print_tensor(out_shape, "\nREf : ", ref_output);
 #endif // KAI_DEBUG
 
-    is_output_correct(out_nhw_size, out_channels, 0.01f, ref_output, act_output);
+    is_output_correct(out_nhw_size, out_channels, 0.0001f, ref_output, act_output);
 
     return 0;
 }
--
GitLab
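Patch 08 moves the reference convolution onto a single-precision accumulator so repeated FP16 additions do not saturate or lose precision; the result is clamped once and narrowed back to FP16 at the end. A minimal standalone sketch of that pattern, assuming float16_t comes from <arm_neon.h> as on typical AArch64 toolchains (the function name and shapes here are illustrative only, not part of the example):

    #include <arm_neon.h>  // assumption: provides the float16_t typedef on AArch64
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Accumulate element-wise products in FP32, then clamp once and narrow to FP16.
    void accumulate_clamp_f16(
        const std::vector<float16_t>& a, const std::vector<float16_t>& b, std::vector<float16_t>& out,
        float clamp_min, float clamp_max) {
        std::vector<float> acc(out.size(), 0.0F);
        for (size_t i = 0; i < out.size(); ++i) {
            acc[i] += static_cast<float>(a[i]) * static_cast<float>(b[i]);
        }
        for (size_t i = 0; i < out.size(); ++i) {
            out[i] = static_cast<float16_t>(std::clamp(acc[i], clamp_min, clamp_max));
        }
    }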
From c943fdbebc6bfe5666934c6e73c0a2a4ef546beb Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Thu, 29 May 2025 12:32:19 +0100
Subject: [PATCH 09/18] Add missing header from standard library.

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index c2561b4a..7c0191b2 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
--
GitLab


From 6f41537e2ab93374fa7ebb0f1645e80ad1e6fcee Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Thu, 29 May 2025 15:31:36 +0100
Subject: [PATCH 10/18] Add HW check for SME2 Extension

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 7c0191b2..06fbc67c 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -22,6 +22,14 @@
 #include
 #include
+#if defined(__linux__)
+#include <sys/auxv.h>
+#endif // defined(__linux__)
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif // defined(__APPLE__)
+
 // Include micro-kernel variants
 #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h"
@@ -229,9 +237,33 @@ bool is_output_correct(
 size_t round_up_division(size_t a, size_t b) {
     return (a + b - 1) / b;
 }
+
+// Check for hardware support for SME2 (required by IMatmul kernels).
+bool has_sme2_support() {
+#if defined(__linux__)
+#ifndef HWCAP2_SME2
+    constexpr uint64_t HWCAP2_SME2 = 1UL << 37;
+#endif // HWCAP2_SME2
+    unsigned long hwcaps = getauxval(AT_HWCAP2);
+    if (hwcaps & HWCAP2_SME2) return true;
+#elif defined(__APPLE__)
+    uint32_t value{};
+    size_t len = sizeof(value);
+    if (sysctlbyname("hw.optional.arm.FEAT_SME2", &value, &len, NULL, 0) != 0) return false;
+    return value;
+#endif // OS check
+    return false;
+}
+
 } // namespace
 
 int main() {
+    // Check for SME support and skip tests if not supported.
+    if (!has_sme2_support()) {
+        printf("\nThis example requires support for the SME2 CPU extension.\n");
+        return 0;
+    }
+
     // Arguments for convolution operation.
     // Padding must be valid
     const size_t batch_size = 5;
--
GitLab
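On Linux, the gate added here reads the auxiliary vector: getauxval and AT_HWCAP2 come from <sys/auxv.h>, and the SME2 bit (1UL << 37, as in the hunk above) is tested directly so the code still builds against older headers that predate the HWCAP2_SME2 macro; patch 11 below removes this check again in favour of CI-level gating. A condensed, Linux-only sketch of the same idea (illustrative, not part of the patch):

    #include <cstdint>

    #if defined(__linux__)
    #include <sys/auxv.h>
    #endif // defined(__linux__)

    // Returns true when the kernel reports the SME2 feature bit in AT_HWCAP2.
    bool cpu_has_sme2() {
    #if defined(__linux__)
    #ifndef HWCAP2_SME2
        constexpr uint64_t HWCAP2_SME2 = 1UL << 37;  // same bit as the patch above
    #endif // HWCAP2_SME2
        return (getauxval(AT_HWCAP2) & HWCAP2_SME2) != 0;
    #else
        return false;  // other platforms need their own query, e.g. sysctlbyname on macOS
    #endif // defined(__linux__)
    }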
From aa36d2de50cf3b337f73df997db439281de3a0ae Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Fri, 30 May 2025 14:39:59 +0100
Subject: [PATCH 11/18] Remove hardware check and prevent sme example from running in tests.

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 1 +
 .../CMakeLists.txt | 0
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 32 -------------------
 3 files changed, 1 insertion(+), 32 deletions(-)
 rename examples/{conv2d_imatmul_clamp_f16_f16_f16p => conv2d_imatmul_clamp_f16_f16_f16p_sme2}/CMakeLists.txt (100%)
 rename examples/{conv2d_imatmul_clamp_f16_f16_f16p => conv2d_imatmul_clamp_f16_f16_f16p_sme2}/conv2d_imatmul_clamp_f16_f16_f16p.cpp (94%)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2f2d56d8..23442725 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -122,6 +122,7 @@ build-examples:
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
+          [[ $EXAMPLE == *sme* ]] && continue
           echo "-----------------------------------------------------------"
           echo "Build examples/${EXAMPLE}"
           echo "-----------------------------------------------------------"
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
similarity index 100%
rename from examples/conv2d_imatmul_clamp_f16_f16_f16p/CMakeLists.txt
rename to examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
similarity index 94%
rename from examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
rename to examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 06fbc67c..7c0191b2 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -22,14 +22,6 @@
 #include
 #include
-#if defined(__linux__)
-#include <sys/auxv.h>
-#endif // defined(__linux__)
-
-#if defined(__APPLE__)
-#include <sys/sysctl.h>
-#endif // defined(__APPLE__)
-
 // Include micro-kernel variants
 #include "kai/ukernels/matmul/imatmul_clamp_f16_f16p_f16p/kai_imatmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h"
@@ -237,33 +229,9 @@ bool is_output_correct(
 size_t round_up_division(size_t a, size_t b) {
     return (a + b - 1) / b;
 }
-
-// Check for hardware support for SME2 (required by IMatmul kernels).
-bool has_sme2_support() {
-#if defined(__linux__)
-#ifndef HWCAP2_SME2
-    constexpr uint64_t HWCAP2_SME2 = 1UL << 37;
-#endif // HWCAP2_SME2
-    unsigned long hwcaps = getauxval(AT_HWCAP2);
-    if (hwcaps & HWCAP2_SME2) return true;
-#elif defined(__APPLE__)
-    uint32_t value{};
-    size_t len = sizeof(value);
-    if (sysctlbyname("hw.optional.arm.FEAT_SME2", &value, &len, NULL, 0) != 0) return false;
-    return value;
-#endif // OS check
-    return false;
-}
-
 } // namespace
 
 int main() {
-    // Check for SME support and skip tests if not supported.
-    if (!has_sme2_support()) {
-        printf("\nThis example requires support for the SME2 CPU extension.\n");
-        return 0;
-    }
-
     // Arguments for convolution operation.
     // Padding must be valid
     const size_t batch_size = 5;
--
GitLab


From 325b4dfb8742c10f044f32b788b869c79384a391 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 11:45:39 +0100
Subject: [PATCH 12/18] Make suggested changes

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 7c0191b2..e5028667 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -31,8 +31,8 @@ using VEC_F16 = std::vector;
 
 namespace {
 
-constexpr float clamp_min = -65504.0F;
-constexpr float clamp_max = 65504.0F;
+constexpr float clamp_min = -9000.0F;
+constexpr float clamp_max = 9000.0F;
 
 struct Shape {
     size_t n;
@@ -117,9 +117,10 @@ void convolution_layer_nhwc(
 /// @param[in] filter_height Height of convolution filter.
 /// @param[in] filter_width Width of convolution filter.
 /// @param[in] itable_cols Number of columns in indirection table (m_step)
-std::vector init_indirection_table(
-    VEC_F16& feature_map, std::vector& indirect_table, float16_t* pad_buffer, const Shape& in_shape,
-    const Shape& out_shape, const size_t filter_height, const size_t filter_width, const size_t itable_cols) {
+std::vector init_indirection_table(
+    const VEC_F16& feature_map, std::vector& indirect_table, const float16_t* pad_buffer,
+    const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
+    const size_t itable_cols) {
     // The indirection buffer here is a series of blocks each of size k_chunk_count * m_step.
     // Number of blocks is = round_up_division(M, m_step)
     const size_t block_size = filter_height * filter_width * itable_cols;
@@ -276,10 +277,10 @@ int main() {
     // out_channels is equivalent to N argument for Indirection kernels.
     const size_t itable_cols = kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme();
     const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);
-    std::vector indirect_table(itable_cols * itable_rows);
+    std::vector indirect_table(itable_cols * itable_rows);
 
     // Start of input feature map is passed as padding pointer, this is not neccessary.
-    float16_t* pad_buffer = feature_map.data();
+    const float16_t* pad_buffer = feature_map.data();
 
     init_indirection_table(
         feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols);
--
GitLab
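The comment block touched above describes the indirection buffer as round_up_division(M, m_step) blocks, each holding filter_height * filter_width * m_step row pointers (one k-chunk per filter position). A self-contained sketch of just that sizing arithmetic, using the example's shapes and an assumed m_step value in place of the kai_get_m_step_lhs_imatmul_pack_x16p2vlx2_x16p_sme() query (numbers are illustrative only):

    #include <cstddef>
    #include <cstdio>

    size_t round_up_division(size_t a, size_t b) { return (a + b - 1) / b; }

    int main() {
        // Illustrative: batch 5, 32x32 input, 5x2 filter as in the example; m_step is assumed here,
        // the real example queries it from the LHS packing micro-kernel.
        const size_t m_step = 8;
        const size_t out_nhw_size = 5 * 28 * 31;  // M = batch * out_height * out_width
        const size_t k_chunk_count = 5 * 2;       // one k-chunk per filter position

        const size_t itable_cols = m_step;
        const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);

        // The table is round_up_division(M, m_step) blocks of k_chunk_count * m_step pointers each.
        std::printf(
            "blocks: %zu, entries per block: %zu, total entries: %zu\n",
            round_up_division(out_nhw_size, itable_cols), k_chunk_count * itable_cols, itable_cols * itable_rows);
        return 0;
    }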
From 675e27fc3192fa289de6e34cf3617faca21dc73b Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 11:48:13 +0100
Subject: [PATCH 13/18] Add changelog note

Signed-off-by: Mohammed Suhail Munshi
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3845ca8e..005bbaff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 - Fixes:
   - Address segmentation faults in benchmarking tool.
   - Fix clamping issues for FP16 and BF16 in testing framework.
+- Added Convolution example using SME Indirect Matmul Kernels
 
 ## v1.8.0
 
--
GitLab


From a5bf700b860c14a373d3edd5b65affb5df665ce0 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Mon, 2 Jun 2025 14:25:44 +0100
Subject: [PATCH 14/18] Change return type of function to void

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index e5028667..28274282 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -117,7 +117,7 @@ void convolution_layer_nhwc(
 /// @param[in] filter_height Height of convolution filter.
 /// @param[in] filter_width Width of convolution filter.
 /// @param[in] itable_cols Number of columns in indirection table (m_step)
-std::vector init_indirection_table(
+void init_indirection_table(
     const VEC_F16& feature_map, std::vector& indirect_table, const float16_t* pad_buffer,
     const Shape& in_shape, const Shape& out_shape, const size_t filter_height, const size_t filter_width,
     const size_t itable_cols) {
--
GitLab


From 24b5221df1f107ada7e145188a4df1396c117de2 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 10:29:25 +0100
Subject: [PATCH 15/18] Remove padding buffer as it is not used.

Signed-off-by: Mohammed Suhail Munshi
---
 CHANGELOG.md | 3 ++-
 .../conv2d_imatmul_clamp_f16_f16_f16p.cpp | 10 ++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 005bbaff..8cb651ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 
 ## Upcoming Release
 
+- Added Convolution example using SME Indirect Matmul Kernels
+
 ## v1.9.0
 
 - Extend support for signed 4-bit integer inputs in `kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon`.
@@ -21,7 +23,6 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 - Fixes:
   - Address segmentation faults in benchmarking tool.
   - Fix clamping issues for FP16 and BF16 in testing framework.
-- Added Convolution example using SME Indirect Matmul Kernels
 
 ## v1.8.0
 
diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
index 28274282..bf47c80d 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/conv2d_imatmul_clamp_f16_f16_f16p.cpp
@@ -165,8 +165,6 @@ void init_indirection_table(
             }
         }
     }
-
-    return indirect_table;
 }
 
 /// Fills the matrix with incremental values according to the provided weight.
@@ -279,10 +277,10 @@ int main() {
     const size_t itable_rows = k_chunk_count * round_up_division(out_nhw_size, itable_cols);
     std::vector indirect_table(itable_cols * itable_rows);
 
-    // Start of input feature map is passed as padding pointer, this is not neccessary.
-    const float16_t* pad_buffer = feature_map.data();
+    // Padding buffer 'pad_buffer' is set to nullptr as there is no padding in this example.
+    // Shapes specified are such that no padding should be needed.
     init_indirection_table(
-        feature_map, indirect_table, pad_buffer, in_shape, out_shape, filter_height, filter_width, itable_cols);
+        feature_map, indirect_table, nullptr, in_shape, out_shape, filter_height, filter_width, itable_cols);
 
     // -------------------------------------------------
     // 2. Pack LHS and RHS.
@@ -295,7 +293,7 @@ int main() {
     VEC_F16 packed_lhs(lhs_packed_size);
     VEC_F16 packed_rhs(rhs_packed_size);
 
-    // Padding is not used in the indirection buffer (as padding is valid), therefore pad_ptr is nullptr
+    // Padding is not used in the indirection buffer, therefore pad_ptr is nullptr
     // Ptr offset is provided as 0 as it is not needed to apply an offset to each valid pointer provided in the table in
     // this case.
     kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme(
--
GitLab


From 601ee7faa0c83537613075414bb0e7bbc9afc107 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 12:43:16 +0100
Subject: [PATCH 16/18] Enable building but not running SME Examples in CI

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 23442725..9a0ab422 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -119,16 +119,17 @@ build-examples:
     - .standard-rules
   script:
     - mkdir -p build
+    # Examples are built, but SME examples are not copied to prevent running on unsupported systems.
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
-          [[ $EXAMPLE == *sme* ]] && continue
           echo "-----------------------------------------------------------"
           echo "Build examples/${EXAMPLE}"
           echo "-----------------------------------------------------------"
           mkdir -p build_${EXAMPLE}
           cmake -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_C_FLAGS="-Werror" -DCMAKE_BUILD_TYPE=Release -S examples/$EXAMPLE -B build_${EXAMPLE}
           cmake --build build_${EXAMPLE} -j${PARALLEL_JOBS} --verbose
+          [[ $EXAMPLE == *sme* ]] && continue
           cp build_${EXAMPLE}/${EXAMPLE} build/
         else
           echo "No build file found for ${EXAMPLE}"
--
GitLab


From 13af4f8b601c042edb5ac5e0607acb334ce6aefd Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 13:31:51 +0100
Subject: [PATCH 17/18] Minor change

Signed-off-by: Mohammed Suhail Munshi
---
 .gitlab-ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9a0ab422..9ca03ac8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -119,7 +119,6 @@ build-examples:
     - .standard-rules
   script:
     - mkdir -p build
-    # Examples are built, but SME examples are not copied to prevent running on unsupported systems.
     - >
       for EXAMPLE in `ls examples -1`; do
         if [ -f examples/${EXAMPLE}/CMakeLists.txt ]; then
@@ -129,7 +128,6 @@ build-examples:
           mkdir -p build_${EXAMPLE}
           cmake -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS="-Werror" -DCMAKE_C_FLAGS="-Werror" -DCMAKE_BUILD_TYPE=Release -S examples/$EXAMPLE -B build_${EXAMPLE}
           cmake --build build_${EXAMPLE} -j${PARALLEL_JOBS} --verbose
-          [[ $EXAMPLE == *sme* ]] && continue
           cp build_${EXAMPLE}/${EXAMPLE} build/
         else
           echo "No build file found for ${EXAMPLE}"
@@ -153,6 +151,7 @@ test-examples:
           echo "-----------------------------------------------------------"
           echo "Run ${EXAMPLE}"
           echo "-----------------------------------------------------------"
+          [[ $EXAMPLE == *sme* ]] && continue
           build/${EXAMPLE} | tee -a example_${EXAMPLE}.log
         done
   artifacts:
--
GitLab


From aed6c82581a9d394ff7ded35be6e971d1ec80cf7 Mon Sep 17 00:00:00 2001
From: Mohammed Suhail Munshi
Date: Tue, 3 Jun 2025 13:49:18 +0100
Subject: [PATCH 18/18] Fix built binary name

Signed-off-by: Mohammed Suhail Munshi
---
 .../conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
index aef754c9..a9499b84 100644
--- a/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
+++ b/examples/conv2d_imatmul_clamp_f16_f16_f16p_sme2/CMakeLists.txt
@@ -6,7 +6,7 @@
 
 cmake_minimum_required(VERSION 3.16)
 
-project(conv2d_imatmul_clamp_f16_f16_f16p)
+project(conv2d_imatmul_clamp_f16_f16_f16p_sme2)
 
 set(CMAKE_CXX_STANDARD 17)
 set(KAI_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../)
@@ -21,13 +21,13 @@ set(KAI_SOURCES
 
 # Files requires to build the executable
 add_executable(
-    conv2d_imatmul_clamp_f16_f16_f16p conv2d_imatmul_clamp_f16_f16_f16p.cpp
+    conv2d_imatmul_clamp_f16_f16_f16p_sme2 conv2d_imatmul_clamp_f16_f16_f16p.cpp
     ${KAI_SOURCES})
 
-target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p
+target_compile_options(conv2d_imatmul_clamp_f16_f16_f16p_sme2
     PRIVATE "-march=armv8.2-a+sve+sve2;-fno-tree-vectorize"
 )
 
-target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p
+target_compile_definitions(conv2d_imatmul_clamp_f16_f16_f16p_sme2
     PRIVATE $<$:KAI_DEBUG>
 )
--
GitLab