diff --git a/examples/matmul_tiling/CMakeLists.txt b/examples/matmul_tiling/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0f86c2b745aef7d6e57603a6953cc978c751a073
--- /dev/null
+++ b/examples/matmul_tiling/CMakeLists.txt
@@ -0,0 +1,33 @@
+#
+# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+cmake_minimum_required(VERSION 3.16)
+
+project(kai_example_matmul_tiling)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(KLEIDIAI_PATH ../../)
+set(MATMUL_PACK_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/)
+set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/)
+
+# KleidiAI include directories
+include_directories(
+    ${KLEIDIAI_PATH}
+    ${MATMUL_PACK_PATH}
+    ${MATMUL_PATH})
+
+# Files required to build the executable
+add_executable(matmul_tiling
+    matmul_tiling.cpp
+    ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c
+    ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qai8dxp_f32.c
+    ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
+    ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c)
+
+target_compile_options(matmul_tiling
+    PRIVATE -march=armv8.2-a+dotprod
+)
diff --git a/examples/matmul_tiling/matmul_tiling.cpp b/examples/matmul_tiling/matmul_tiling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9829063e3d935d777d68f167e4daab35369d2b4
--- /dev/null
+++ b/examples/matmul_tiling/matmul_tiling.cpp
@@ -0,0 +1,528 @@
+//
+// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+#if !defined(__ARM_FEATURE_DOTPROD)
+#error "Dotprod extension required to compile this example"
+#else
+#include <algorithm>
+#include <cfloat>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+
+// Include micro-kernel variants
+#include "kai_lhs_quant_pack_qai8dxp_f32.h"
+#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h"
+#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h"
+#include "kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h"
+#include "kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h"
+
+#define INT4_MIN (-8)
+#define INT4_MAX (7)
+
+enum class exec_type {
+    non_tiled = 0,
+    tiled = 1,
+};
+
+struct mnk {
+    size_t m = 0;
+    size_t n = 0;
+    size_t k = 0;
+};
+
+inline long time_in_ms() {
+    using namespace std::chrono;
+    auto now = time_point_cast<milliseconds>(steady_clock::now());
+    return now.time_since_epoch().count();
+}
+
+mnk matmul_shapes[] = {{1024, 1024, 1024}};
+
+size_t exec_times_ms[2];
+
+// Micro-kernel interface
+struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp {
+    kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel ukernel;
+    std::string name = {};
+};
+
+const kai_matmul_ukernel_f32_qa8dxp_qs4cxp ukernel_variants[] = {
+    {kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod,
+     "matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod"},
+    {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod,
+     "matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod"},
+};
+
+#define VEC_MAT_VARIANT 0
+#define MAT_MAT_VARIANT 1
+
+// Number of micro-kernel variants stored in the array
+const size_t num_ukernel_variants = sizeof(ukernel_variants) / sizeof(ukernel_variants[0]);
+
+static size_t roundup(size_t a, size_t b) {
+    return ((a + b - 1) / b) * b;
+}
+
+static void fill_uniform_random(size_t num_rows, size_t num_cols, float* dst, size_t seed) {
+    std::srand(seed);
+
+    // Fill the array with random values between -1 and 1
+    for (size_t i = 0; i < num_rows * num_cols; i++) {
+        dst[i] = (float)((double)std::rand() / RAND_MAX) * 2 - 1;
+    }
+}
+
+static void ref_quant_qa8dx_f32(size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) {
+    const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t));
+
+    const size_t lhs_qa8dx_stride = k;
+
+    for (size_t m_idx = 0; m_idx < m; ++m_idx) {
+        const float* src_ptr = lhs_f32 + m_idx * lhs_qa8dx_stride;
+
+        float max0 = -FLT_MAX;
+        float min0 = FLT_MAX;
+
+        // Find min/max for each channel
+        for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+            const float src0_0 = src_ptr[k_idx];
+
+            max0 = std::max(src0_0, max0);
+            min0 = std::min(src0_0, min0);
+        }
+
+        // Maximum/minimum int8 values
+        const float qmin = (float)INT8_MIN;
+        const float qmax = (float)INT8_MAX;
+
+        const float rmin0 = std::min(0.0f, min0);
+        const float rmax0 = std::max(0.0f, max0);
+
+        const float scale0 = rmin0 == rmax0 ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
+
+        // Reciprocal to quantize
+        const float recip_scale0 = scale0 ? 1.0f / scale0 : 0.0f;
+
+        const float descaled_min0 = rmin0 * scale0;
+        const float descaled_max0 = rmax0 * scale0;
+
+        const float zero_point_from_min_error0 = qmin + descaled_min0;
+        const float zero_point_from_max_error0 = qmax + descaled_max0;
+
+        float zero_point0 =
+            zero_point_from_min_error0 + zero_point_from_max_error0 > 0 ?
+                qmin - descaled_min0 : qmax - descaled_max0;
+
+        zero_point0 = std::max(zero_point0, qmin);
+        zero_point0 = std::min(zero_point0, qmax);
+
+        // Round to nearest integer
+        const int32_t nudged_zero_point0 = lrintf(zero_point0);
+
+        int8_t* dst0_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride;
+
+        // LHS offset at the beginning of the row
+        *((float*)(dst0_ptr)) = recip_scale0;
+        dst0_ptr += sizeof(float);
+        *((int32_t*)(dst0_ptr)) = -nudged_zero_point0;
+        dst0_ptr += sizeof(int32_t);
+
+        // Quantize the channels
+        for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+            const float src0_0 = src_ptr[k_idx];
+
+            // Scale the values
+            int32_t v0_s32 = (int32_t)(round(src0_0 * scale0));
+
+            v0_s32 = v0_s32 + nudged_zero_point0;
+            v0_s32 = std::max(v0_s32, INT8_MIN);
+            v0_s32 = std::min(v0_s32, INT8_MAX);
+            dst0_ptr[0] = (int8_t)v0_s32;
+            dst0_ptr += sizeof(int8_t);
+        }
+    }
+};
+
+static void ref_quant_nxk_qs4cx_f32(size_t n, size_t k, const float* rhs_f32, uint8_t* rhs_qs4cx, float* rhs_scales_f32) {
+    const size_t rhs_qs4cx_stride = (roundup(k, 2) / 2);
+
+    // Make sure the output is filled with zeros
+    std::memset(rhs_qs4cx, 0, n * rhs_qs4cx_stride);
+
+    for (size_t n_idx = 0; n_idx < n; ++n_idx) {
+        const float* src_ptr = rhs_f32 + n_idx * k;
+
+        float max0 = -FLT_MAX;
+        float min0 = FLT_MAX;
+
+        // Find min/max for each channel
+        for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+            const float src0_0 = src_ptr[k_idx];
+
+            max0 = std::max(src0_0, max0);
+            min0 = std::min(src0_0, min0);
+        }
+
+        // Maximum/minimum int4 values
+        const float qmin = (float)INT4_MIN;
+        const float qmax = (float)INT4_MAX;
+
+        const float rmin0 = std::min(0.0f, min0);
+        const float rmax0 = std::max(0.0f, max0);
+
+        const float scale0 = rmin0 == rmax0 ? 1.f : (qmax - qmin) / (rmax0 - rmin0);
+
+        // Reciprocal to quantize
+        const float recip_scale0 = scale0 ?
+            1.0f / scale0 : 0.0f;
+
+        // Quantize the channels
+        for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+            const float src0_0 = src_ptr[k_idx];
+
+            // Scale the values
+            int32_t v0_s32 = (int32_t)(round(src0_0 * scale0));
+
+            // Maximum/minimum int4 values
+            v0_s32 = std::max(v0_s32, INT4_MIN);
+            v0_s32 = std::min(v0_s32, INT4_MAX);
+
+            const uint8_t v0_u8 = (uint8_t)(v0_s32 + 8);
+
+            const size_t dst_addr = (k_idx / 2) + n_idx * rhs_qs4cx_stride;
+            uint8_t rhs_v0 = rhs_qs4cx[dst_addr];
+
+            if ((k_idx % 2) == 0) {
+                rhs_v0 |= v0_u8;
+            } else {
+                rhs_v0 |= (v0_u8 << 4);
+            }
+            rhs_qs4cx[dst_addr] = rhs_v0;
+        }
+
+        rhs_scales_f32[n_idx] = recip_scale0;
+    }
+};
+
+static void ref_matmul_mxn_mxk_nxk_f32_qa8dx_qs4cx(
+    size_t m, size_t n, size_t k, const int8_t* lhs_qa8dx, const uint8_t* rhs_qs4cx, const float* rhs_scales_f32,
+    float* dst_f32, float scalar_min, float scalar_max) {
+    const size_t lhs_stride = k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t);
+
+    const size_t rhs_qs4cx_stride = (roundup(k, 2) / 2);
+
+    for (size_t m_idx = 0; m_idx < m; ++m_idx) {
+        const int8_t* lhs_ptr_start = lhs_qa8dx + m_idx * lhs_stride;
+
+        for (size_t n_idx = 0; n_idx < n; ++n_idx) {
+            // Main accumulator (int32)
+            int32_t iacc = 0;
+
+            const int8_t* lhs_ptr = lhs_ptr_start;
+            const uint8_t* rhs_ptr = rhs_qs4cx + n_idx * rhs_qs4cx_stride;
+
+            // Get the LHS quantization parameters stored at the
+            // beginning of each row
+            const float lhs_scale = *(const float*)lhs_ptr;
+            lhs_ptr += sizeof(float);
+
+            const int32_t lhs_offset = *(const int32_t*)lhs_ptr;
+            lhs_ptr += sizeof(int32_t);
+
+            for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+                // Get the LHS values
+                const int32_t lhs_v0 = (int32_t)lhs_ptr[0];
+
+                // Get the RHS values
+                const uint8_t rhs_byte = rhs_ptr[0];
+
+                // Unpack the RHS values
+                int32_t rhs_v0 = 0;
+                if ((k_idx % 2) == 0) {
+                    rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8);
+                } else {
+                    rhs_v0 = (((int32_t)(rhs_byte >> 4)) - 8);
+                }
+
+                iacc += lhs_v0 * rhs_v0;
+                iacc += lhs_offset * rhs_v0;
+
+                lhs_ptr += 1;
+
+                // Increment only when k_idx is not a multiple of 2
+                rhs_ptr += k_idx % 2;
+            }
+
+            // Get the RHS scale
+            const float rhs_scale = rhs_scales_f32[n_idx];
+
+            float main_acc = iacc * rhs_scale;
+
+            main_acc = main_acc * lhs_scale;
+
+            // Clamp (min-max) operation
+            main_acc = std::max(main_acc, scalar_min);
+            main_acc = std::min(main_acc, scalar_max);
+
+            dst_f32[0] = main_acc;
+            dst_f32 += 1;
+        }
+    }
+};
+
+static void ref_add_f32_f32_f32(
+    const float* lhs, const float* rhs, float* dst, size_t m, size_t n) {
+
+    for (size_t m_idx = 0; m_idx < m; ++m_idx) {
+        for (size_t n_idx = 0; n_idx < n; ++n_idx) {
+            const size_t off = (m_idx * n) + n_idx;
+            dst[off] = (lhs[off] + rhs[off]);
+        }
+    }
+};
+
+static bool is_output_correct(size_t num_rows, size_t num_cols, float tolerance, const float* ref, const float* act) {
+    bool is_valid = true;
+
+    for (size_t i = 0; i < num_rows * num_cols; ++i) {
+        if (std::fabs(ref[i] - act[i]) > tolerance) {
+            const size_t x = i % num_cols;
+            const size_t y = i / num_cols;
+            printf(
+                "ERROR![%zu][%zu]: ref=%.5f vs. "
+                "act=%.5f\n",
+                y, x, ref[i], act[i]);
+            is_valid = false;
+            return is_valid;
+        }
+    }
+    return is_valid;
+}
+
+int main(int argc, char** argv) {
+    const size_t seed_lhs = 4568;
+    const size_t seed_rhs = seed_lhs + 4;
+    const size_t seed_bia = seed_lhs + 21;
+    const size_t num_shapes = std::size(matmul_shapes);
+
+    std::cout << "------------" << std::endl;
+    for (size_t test_idx = 0; test_idx < num_shapes; ++test_idx) {
+        size_t m = matmul_shapes[test_idx].m;
+        size_t n = matmul_shapes[test_idx].n;
+        size_t k = matmul_shapes[test_idx].k;
+
+        std::cout << "\nTEST[" << m << ", " << n << ", " << k << "]"
+                  << "\n";
+
+        const size_t lhs_native_size_f32 = m * k * sizeof(float);
+        const size_t rhs_native_size_f32 = n * k * sizeof(float);
+        const size_t rhs_native_size_qs4cx = n * (roundup(k, 2) / 2) * sizeof(uint8_t);
+        const size_t rhs_scales_size_f32 = n * sizeof(float);
+        const size_t bia_size_f32 = m * n * sizeof(float);
+
+        // Allocate the memory
+        uint8_t* lhs_native_mtx_f32 = new uint8_t[lhs_native_size_f32];
+        uint8_t* rhs_native_mtx_f32 = new uint8_t[rhs_native_size_f32];
+        uint8_t* rhs_native_mtx_qs4cx = new uint8_t[rhs_native_size_qs4cx];
+        uint8_t* rhs_scales_f32 = new uint8_t[rhs_scales_size_f32];
+        uint8_t* bia_mtx_f32 = new uint8_t[bia_size_f32];
+
+        // Fill the native matrices with random data and quantize the RHS matrix.
+        // These buffers feed both the reference path and the micro-kernel path.
+        fill_uniform_random(m, k, (float*)lhs_native_mtx_f32, seed_lhs);
+        fill_uniform_random(n, k, (float*)rhs_native_mtx_f32, seed_rhs);
+        fill_uniform_random(m, n, (float*)bia_mtx_f32, seed_bia);
+
+        ref_quant_nxk_qs4cx_f32(
+            n, k, (const float*)rhs_native_mtx_f32, (uint8_t*)rhs_native_mtx_qs4cx, (float*)rhs_scales_f32);
+
+        delete[] rhs_native_mtx_f32;
+
+        //----------- REFERENCE IMPLEMENTATION
+        //------------------------------------
+        //------------------------------------
+        // Memory sizes for the reference implementation
+        // After dynamically quantizing the LHS matrix, we have the scale and offset for each
+        // row. The scale (f32) and offset (int32) are stored at the beginning of each row
+        const size_t lhs_ref_size_qa8dx = m * (k + sizeof(int32_t) + sizeof(float));
+        const size_t dst_ref_size_f32 = m * n * sizeof(float);
+
+        uint8_t* lhs_ref_mtx_qa8dx = new uint8_t[lhs_ref_size_qa8dx];
+        uint8_t* dst0_ref_mtx_f32 = new uint8_t[dst_ref_size_f32];
+        uint8_t* dst1_ref_mtx_f32 = new uint8_t[dst_ref_size_f32];
+
+#if defined(DEBUG)
+        std::cout << "Preparing the reference implementation..." << std::endl;
+        ref_quant_qa8dx_f32(m, k, (const float*)lhs_native_mtx_f32, (int8_t*)lhs_ref_mtx_qa8dx);
+
+        ref_matmul_mxn_mxk_nxk_f32_qa8dx_qs4cx(
+            m, n, k, (const int8_t*)lhs_ref_mtx_qa8dx, (const uint8_t*)rhs_native_mtx_qs4cx,
+            (const float*)rhs_scales_f32, (float*)dst0_ref_mtx_f32, -FLT_MAX, FLT_MAX);
+
+        ref_add_f32_f32_f32(
+            (const float*)dst0_ref_mtx_f32,
+            (const float*)bia_mtx_f32,
+            (float*)dst1_ref_mtx_f32,
+            m, n);
+#endif // defined(DEBUG)
+
+        // Remove the unnecessary buffer
+        delete[] lhs_ref_mtx_qa8dx;
+
+        //----------- END REFERENCE IMPLEMENTATION
+        //------------------------------------
+        //------------------------------------
+
+        //----------- MICRO-KERNELS TESTS
+        //------------------------------------
+        //------------------------------------
+        // Basic heuristic. If m == 1, we should call the vector-by-matrix variant.
+        // Otherwise, we should call the matrix-by-matrix variant
+        const size_t idx_variant = m == 1 ?
+            VEC_MAT_VARIANT : MAT_MAT_VARIANT;
+        std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
+
+        // Get the packing parameters
+        const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
+        const size_t nr = ukernel_variants[idx_variant].ukernel.get_nr();
+        const size_t kr = ukernel_variants[idx_variant].ukernel.get_kr();
+        const size_t sr = ukernel_variants[idx_variant].ukernel.get_sr();
+
+        // Get the size in bytes for the packed matrices
+        const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(m, k, mr, kr, sr);
+        const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(n, k, nr, kr, sr);
+
+        const size_t dst_size = ukernel_variants[idx_variant].ukernel.get_dst_size(m, n);
+
+        // Allocate the matrices
+        uint8_t* lhs_packed_mtx_qa8dx = new uint8_t[lhs_packed_size];
+        uint8_t* rhs_packed_mtx_qs4cx = new uint8_t[rhs_packed_size];
+        uint8_t* dst0_mtx_f32 = new uint8_t[dst_size];
+        uint8_t* dst1_mtx_f32 = new uint8_t[dst_size];
+
+        // If the RHS matrix contains constant values, the packing can be performed
+        // only once
+        struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params nxk_params;
+
+        nxk_params.lhs_zero_point = 1;
+        nxk_params.rhs_zero_point = 8;
+
+        // RHS packing
+        kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(
+            1, n, k, nr, kr, sr,                     // Packing arguments
+            (const uint8_t*)(rhs_native_mtx_qs4cx),  // RHS
+            NULL,                                    // Bias
+            (const float*)(rhs_scales_f32),          // Scale
+            rhs_packed_mtx_qs4cx,                    // RHS packed
+            0, &nxk_params);
+
+        for (const exec_type& type : {exec_type::non_tiled, exec_type::tiled}) {
+
+            const size_t m_step = type == exec_type::non_tiled ? m : ukernel_variants[idx_variant].ukernel.get_m_step();
+
+            const auto start_t = time_in_ms();
+
+            for (size_t m_idx = 0; m_idx < m; m_idx += m_step) {
+
+                const size_t lhs_native_stride = k * sizeof(float);
+
+                // LHS packing
+                kai_run_lhs_quant_pack_qai8dxp_f32(
+                    m_step, k, mr, kr, sr, 0,                                        // Packing arguments
+                    (const float*)(lhs_native_mtx_f32 + m_idx * lhs_native_stride),  // LHS
+                    lhs_native_stride,                                               // LHS stride
+                    lhs_packed_mtx_qa8dx);                                           // LHS packed
+
+                // Matmul
+                const size_t dst_stride = n * sizeof(float);
+                const size_t lhs_offset = ukernel_variants[idx_variant].ukernel.get_lhs_packed_offset(0, k);
+                const size_t rhs_offset = ukernel_variants[idx_variant].ukernel.get_rhs_packed_offset(0, k);
+                const size_t dst0_offset = ukernel_variants[idx_variant].ukernel.get_dst_offset(0, 0, dst_stride);
+                const size_t dst1_offset = ukernel_variants[idx_variant].ukernel.get_dst_offset(m_idx, 0, dst_stride);
+
+                const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qa8dx + lhs_offset);
+                const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4cx + rhs_offset);
+                const float* bia_ptr = (float*)(bia_mtx_f32 + dst1_offset);
+                float* dst0_ptr = (float*)((uint8_t*)dst0_mtx_f32 + dst0_offset);
+                float* dst1_ptr = (float*)((uint8_t*)dst1_mtx_f32 + dst1_offset);
+
+                ukernel_variants[idx_variant].ukernel.run_matmul(
+                    m_step, n, k,      // Dimensions
+                    lhs_ptr,           // LHS packed
+                    rhs_ptr,           // RHS packed
+                    dst0_ptr,          // DST
+                    dst_stride,        // DST stride (row)
+                    sizeof(float),     // DST stride (col)
+                    -FLT_MAX, FLT_MAX  // Min and max for the clamp operation
+                );
+
+                ref_add_f32_f32_f32(
+                    (const float*)dst0_ptr,
+                    (const float*)bia_ptr,
+                    (float*)dst1_ptr,
+                    m_step, n);
+            }
+
+            const auto end_t = time_in_ms();
+
+            exec_times_ms[static_cast<size_t>(type)] = end_t - start_t;
+
+#if defined(DEBUG)
+            const bool is_valid =
+                is_output_correct(m, n, 0.0001f, (const float*)dst1_ref_mtx_f32,
+                                  (const float*)dst1_mtx_f32);
+
+            if (is_valid) {
+                std::cout << "TEST[" << idx_variant << "] = PASSED" << std::endl;
+            } else {
+                std::cout << "TEST[" << idx_variant << "] = FAILED" << std::endl;
+            }
+#endif // defined(DEBUG)
+        }
+
+        const size_t non_tiled_exec_ms = exec_times_ms[static_cast<size_t>(exec_type::non_tiled)];
+        const size_t tiled_exec_ms = exec_times_ms[static_cast<size_t>(exec_type::tiled)];
+        const float r =
+            ((static_cast<float>(non_tiled_exec_ms) - static_cast<float>(tiled_exec_ms)) / non_tiled_exec_ms) * 100.f;
+
+        std::cout << "Execution time [NON-TILED]: " << non_tiled_exec_ms << " ms" << std::endl;
+        std::cout << "Execution time [TILED]: " << tiled_exec_ms << " ms" << std::endl;
+        std::cout << "% TILED faster(+) or slower (-) than NON-TILED: " << r << std::endl;
+
+        delete[] lhs_packed_mtx_qa8dx;
+        delete[] rhs_packed_mtx_qs4cx;
+        delete[] dst0_mtx_f32;
+        delete[] dst1_mtx_f32;
+
+        delete[] lhs_native_mtx_f32;
+        delete[] rhs_native_mtx_qs4cx;
+        delete[] rhs_scales_f32;
+        delete[] bia_mtx_f32;
+        delete[] dst0_ref_mtx_f32;
+        delete[] dst1_ref_mtx_f32;
+    }
+}
+
+//----------- END MICRO-KERNELS TESTS
+//------------------------------------
+//------------------------------------
+
+#endif // Architectural feature check
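Note: the tiled loop in the example advances by m_step rows per iteration and assumes m is an exact multiple of m_step, which holds for the hard-coded 1024x1024x1024 shape but not for arbitrary shapes. The snippet below is a minimal, illustrative sketch (not part of this patch or of the KleidiAI API) of how the M dimension could be walked with a clamped final tile; the helper name for_each_m_tile is hypothetical, and the per-tile callback would pass the clamped tile height, instead of m_step, to kai_run_lhs_quant_pack_qai8dxp_f32 and to run_matmul.

#include <algorithm>
#include <cstddef>

// Hypothetical helper: visit the M dimension in tiles of at most m_step rows,
// shrinking the final tile when m is not a multiple of m_step.
template <typename F>
void for_each_m_tile(size_t m, size_t m_step, F&& fn) {
    for (size_t m_idx = 0; m_idx < m; m_idx += m_step) {
        // Height of the current tile: full m_step except possibly the last one.
        const size_t m_tile = std::min(m_step, m - m_idx);
        fn(m_idx, m_tile);  // e.g. pack m_tile rows of LHS, then run the matmul on m_tile rows
    }
}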