From 4aa9fb031d6fcdab8cc13b5c8b09a08df0639da7 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Thu, 3 Apr 2025 13:58:41 +0100 Subject: [PATCH 1/2] Rename kai_lhs_quant_pack_qsi8d32p_f32_neon to kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon Signed-off-by: Anitha Raj --- CHANGELOG.md | 3 +++ CMakeLists.txt | 2 +- kai/ukernels/matmul/BUILD.bazel | 2 +- ...qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h | 4 ++-- ...f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h | 4 ++-- ... => kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c} | 14 +++++++------- ... => kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h} | 12 ++++++------ .../matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 8 ++++---- 8 files changed, 26 insertions(+), 23 deletions(-) rename kai/ukernels/matmul/pack/{kai_lhs_quant_pack_qsi8d32p_f32_neon.c => kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c} (87%) rename kai/ukernels/matmul/pack/{kai_lhs_quant_pack_qsi8d32p_f32_neon.h => kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h} (88%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1247ef29..8fe632af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo ## Upcoming Release +- Breaking changes: + - Rename kai_lhs_quant_pack_qsi8d32p_f32_neon to kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon + ## v1.6.0 - Add CMake installation and `find_package()` support. diff --git a/CMakeLists.txt b/CMakeLists.txt index 53ce829f..73bb83c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,7 +118,7 @@ set(KLEIDIAI_FILES_NEON_FP16_BF16 set(KLEIDIAI_FILES_NEON kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S - kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c + kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 3796d88d..7b656c49 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -28,7 +28,7 @@ SCALAR_KERNELS = [ # buildifier: keep sorted NEON_KERNELS = [ - "pack/kai_lhs_quant_pack_qsi8d32p_f32_neon", + "pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon", "pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon", "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0", "pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon", diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h index 5c7e6e05..4ecff7cb 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h index 990fa145..ec0da8f1 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c similarity index 87% rename from kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c rename to kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c index 70298754..1bba9838 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,7 +8,7 @@ #error This file must be compiled for AArch64. #else // Architectural features check. -#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h" +#include "kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h" #include #include @@ -32,16 +32,16 @@ inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_ return mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block(bl); } -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32_neon(size_t mr) { +size_t kai_get_m_step_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t mr) { KAI_UNUSED(mr); return 1; } -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t lhs_stride) { +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t m_idx, size_t lhs_stride) { return m_idx * lhs_stride; } -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { KAI_ASSUME((k % 2) == 0); KAI_ASSUME((k % kr) == 0); @@ -55,7 +55,7 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( return (m_idx / mr) * kai_lhs_packed_stride(k, mr, kr, bl); } -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { KAI_ASSUME((k % 2) == 0); KAI_ASSUME((k % kr) == 0); @@ -69,7 +69,7 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( return (num_rows * kai_lhs_packed_stride(k, mr, kr, bl)); } -void kai_run_lhs_quant_pack_qsi8d32p_f32_neon( +void kai_run_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed) { KAI_ASSUME((bl % kr) == 0); diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h similarity index 88% rename from kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h rename to kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h index 837d95bd..3e28ec99 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -18,7 +18,7 @@ extern "C" { /// @param[in] mr The number of M rows to interleave on the same output row. /// /// @return the m step value -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32_neon(size_t mr); +size_t kai_get_m_step_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t mr); /// Gets the offset in bytes for the LHS matrix (not packed) /// @@ -28,7 +28,7 @@ size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32_neon(size_t mr); /// @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed) /// /// @return the offset in bytes to the LHS matrix -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t lhs_stride); +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t m_idx, size_t lhs_stride); /// Gets the offset in bytes for the packed LHS matrix, /// which contains the packed 8-bit quantized symmetric per-block (qsi8d32) values. @@ -43,7 +43,7 @@ size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the offset in bytes to the packed LHS matrix -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Gets the size in bytes for the quantized and packed LHS matrix @@ -57,7 +57,7 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the packed LHS matrix size in bytes -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Run the micro-kernel to quantize and pack the LHS matrix. @@ -74,7 +74,7 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( /// @param[in] lhs LHS matrix. /// @param[in] lhs_stride Stride in bytes between two rows of LHS. /// @param[out] lhs_packed The quantized and packed LHS matrix. -void kai_run_lhs_quant_pack_qsi8d32p_f32_neon( +void kai_run_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index 153decb5..4c24abf6 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -25,7 +25,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h" -#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h" +#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" #include "test/common/cpu_info.hpp" @@ -86,10 +86,10 @@ static const std::array< clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa, cpu_has_sme2, lhs_quant_pack_qsi8d32p_f32_neon, - rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon), + clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa, cpu_has_sme2, + lhs_quant_pack_qsi8d32pmrx4_f32_neon, rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, cpu_has_sme2, lhs_quant_pack_qsi8d32p_f32_neon, + clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, cpu_has_sme2, lhs_quant_pack_qsi8d32pmrx4_f32_neon, rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon)}}; using MatMulTestParams_withPortion = std::tuple; -- GitLab From bb1a4a7b1daf628f1fe8547037994c1f9727db74 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Fri, 4 Apr 2025 12:31:21 +0100 Subject: [PATCH 2/2] Rename kai_lhs_quant_pack_qsi8d32p_f32 to kai_lhs_quant_pack_qsi8d32p_f32_neon - Adding architectural header guard and renaming kai_lhs_quant_pack_qsi8d32p_f32 to kai_lhs_quant_pack_qsi8d32p_f32_neon to represent the optimization used. Signed-off-by: Anitha Raj --- CHANGELOG.md | 1 + CMakeLists.txt | 16 +++++++-------- .../CMakeLists.txt | 4 ++-- .../matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp | 8 ++++---- kai/ukernels/matmul/BUILD.bazel | 2 +- ...qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h | 4 ++-- ...8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h | 7 +++---- ...si8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h | 4 ++-- ...2_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h | 6 +++--- ...qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h | 6 +++--- kai/ukernels/matmul/pack/README.md | 4 ++-- ...=> kai_lhs_quant_pack_qsi8d32p_f32_neon.c} | 20 ++++++++++++------- ...=> kai_lhs_quant_pack_qsi8d32p_f32_neon.h} | 12 +++++------ ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 20 +++++++++---------- 14 files changed, 60 insertions(+), 54 deletions(-) rename kai/ukernels/matmul/pack/{kai_lhs_quant_pack_qsi8d32p_f32.c => kai_lhs_quant_pack_qsi8d32p_f32_neon.c} (84%) rename kai/ukernels/matmul/pack/{kai_lhs_quant_pack_qsi8d32p_f32.h => kai_lhs_quant_pack_qsi8d32p_f32_neon.h} (88%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fe632af..897a124a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - Breaking changes: - Rename kai_lhs_quant_pack_qsi8d32p_f32_neon to kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon + - Rename kai_lhs_quant_pack_qsi8d32p_f32 to kai_lhs_quant_pack_qsi8d32p_f32_neon ## v1.6.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 73bb83c6..6c4bb5f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,37 +87,37 @@ endif() set(KLEIDIAI_FILES_SCALAR kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c - kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c -) + ) -set(KLEIDIAI_FILES_NEON_FP16 + set(KLEIDIAI_FILES_NEON_FP16 kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c -) + ) -set(KLEIDIAI_FILES_NEON_BF16 + set(KLEIDIAI_FILES_NEON_BF16 kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.c kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c -) + ) -set(KLEIDIAI_FILES_NEON_FP16_BF16 + set(KLEIDIAI_FILES_NEON_FP16_BF16 kai/ukernels/matmul/matmul_clamp_f16_bf16p_bf16p/kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla.c kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c -) + ) set(KLEIDIAI_FILES_NEON kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S + kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.c diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt index 8159b1c3..528bd550 100644 --- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -24,7 +24,7 @@ include_directories( add_executable(matmul_clamp_f32_qsi8d32p_qsi4c32p matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c - ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c + ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32_neon.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c) diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp index 6d992b62..445ec50e 100644 --- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp +++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ #include // Include micro-kernel variants -#include "kai_lhs_quant_pack_qsi8d32p_f32.h" +#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h" #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h" @@ -349,7 +349,7 @@ int main(int argc, char** argv) { const size_t sr = ukernel_variants[idx_variant].ukernel.get_sr(); // Get the size in bytes for the packed matrices - const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); + const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon(m, k, bl, mr, kr, sr); const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); const size_t dst_size = ukernel_variants[idx_variant].ukernel.get_dst_size(m, n); @@ -378,7 +378,7 @@ int main(int argc, char** argv) { const auto time_s = std::chrono::high_resolution_clock::now(); // LHS packing - kai_run_lhs_quant_pack_qsi8d32p_f32( + kai_run_lhs_quant_pack_qsi8d32p_f32_neon( m, k, bl, // Dimensions mr, kr, sr, 0, // Packing arguments (const float*)lhs_native_mtx_f32, // LHS diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 7b656c49..93f4aed5 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -21,13 +21,13 @@ package(default_visibility = ["//visibility:private"]) # buildifier: keep sorted SCALAR_KERNELS = [ "pack/kai_lhs_quant_pack_qai8dxp_f32", - "pack/kai_lhs_quant_pack_qsi8d32p_f32", "pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0", "pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0", ] # buildifier: keep sorted NEON_KERNELS = [ + "pack/kai_lhs_quant_pack_qsi8d32p_f32_neon", "pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon", "pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon", "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0", diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h index 44c51a72..85b50256 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32pscalef16qsu4c32s16s0 to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h index e7f4d70d..48b09c32 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h @@ -1,6 +1,5 @@ - // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +13,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -116,7 +115,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dot /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. /// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h index 481d69ab..358360c9 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32pscalef16qsu4c32s16s0 to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h index 593e4600..09300673 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -117,7 +117,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm( /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. /// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h index 53c5c933..b5b1cc7e 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -117,7 +117,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8m /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. /// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/pack/README.md b/kai/ukernels/matmul/pack/README.md index 950a69ac..da5ea748 100644 --- a/kai/ukernels/matmul/pack/README.md +++ b/kai/ukernels/matmul/pack/README.md @@ -1,5 +1,5 @@ @@ -48,7 +48,7 @@ For optimal cache utilization, the operands are packed for the matmul operations These packing routines are used with LHS operand of the matmul. It quantizes the input to int8 and packs them along with their scale (and offset values in asymmetric quantization) in the destination matrix. -#### kai_run_lhs_quant_pack_qsi8d32p_f32() +#### kai_run_lhs_quant_pack_qsi8d32p_f32_neon() Quantize and pack LHS matrix with per-block quantization parameters. diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c similarity index 84% rename from kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c rename to kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c index a75daf27..07c6355f 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c @@ -1,9 +1,14 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // -#include "kai_lhs_quant_pack_qsi8d32p_f32.h" + +#if !defined(__aarch64__) +#error This file must be compiled for AArch64. +#else // Architectural features check. + +#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include #include @@ -26,16 +31,16 @@ inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_ return mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block(bl); } -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr) { +size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32_neon(size_t mr) { KAI_UNUSED(mr); return 1; } -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_stride) { +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t lhs_stride) { return m_idx * lhs_stride; } -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { KAI_ASSUME((k % 2) == 0); KAI_ASSUME((k % kr) == 0); @@ -47,7 +52,7 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( return (m_idx / mr) * kai_lhs_packed_stride(k, mr, kr, bl); } -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { KAI_ASSUME((k % 2) == 0); KAI_ASSUME((k % kr) == 0); @@ -61,7 +66,7 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( return num_rows * kai_lhs_packed_stride(k, mr, kr, bl); } -void kai_run_lhs_quant_pack_qsi8d32p_f32( +void kai_run_lhs_quant_pack_qsi8d32p_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed) { if (m == 0) { @@ -122,3 +127,4 @@ void kai_run_lhs_quant_pack_qsi8d32p_f32( } } } +#endif // Architectural features check. diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h similarity index 88% rename from kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h rename to kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h index 2baef93f..d5dac6d1 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -19,7 +19,7 @@ extern "C" { /// @param[in] mr The number of M rows to interleave on the same output row. /// /// @return the m step value -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr); +size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32_neon(size_t mr); /// Gets the offset in bytes for the LHS matrix (not packed) /// @@ -29,7 +29,7 @@ size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr); /// @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed) /// /// @return the offset in bytes to the LHS matrix -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_stride); +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t lhs_stride); /// Gets the offset in bytes for the packed LHS matrix, /// which contains the packed 8-bit quantized symmetric per-block (qsi8d32) values. @@ -44,7 +44,7 @@ size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_s /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the offset in bytes to the packed LHS matrix -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Gets the size in bytes for the quantized and packed LHS matrix @@ -58,7 +58,7 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the packed LHS matrix size in bytes -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Run the micro-kernel to quantize and pack the LHS matrix. @@ -75,7 +75,7 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( /// @param[in] lhs LHS matrix. /// @param[in] lhs_stride Stride in bytes between two rows of LHS. /// @param[out] lhs_packed The quantized and packed LHS matrix. -void kai_run_lhs_quant_pack_qsi8d32p_f32( +void kai_run_lhs_quant_pack_qsi8d32p_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index 4c24abf6..9da7e571 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -24,7 +24,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h" -#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h" +#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" @@ -44,14 +44,14 @@ namespace kai::test { // Interface for the LHS and RHS packed size and packing functions -using kai_get_lhs_packed_size_func_t = decltype(&kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_packed_size_func_t = decltype(&kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_packed_size_func_t = decltype(&kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_get_lhs_packed_offset_func_t = decltype(&kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_packed_offset_func_t = decltype(&kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_packed_offset_func_t = decltype(&kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_get_lhs_offset_func_t = decltype(&kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_offset_func_t = decltype(&kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_offset_func_t = decltype(&kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_run_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32); +using kai_run_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32_neon); using kai_run_rhs_pack_func_t = decltype(&kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); // Micro-kernel interface @@ -71,19 +71,19 @@ static const std::array< 7> variants_kai_matmul_clamp_f32_qsi8d32p_qsi4c32p = { {UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa, cpu_has_sme2, -- GitLab