diff --git a/CHANGELOG.md b/CHANGELOG.md index 65b5f24c310fab03c249e1de86946ff769fc4096..82394453b65ba7e81f454964179e3462b823d9fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - Add MSVC support for test framework - Fixes: - Fix several CPU feature check issues affecting test framework + - Fix the LHS/RHS packed offset calculation in matmul get_offset methods ## v1.1.0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c index 14e69f4ce0aed2b19fcee6238f44f594fc25f3b1..6ad79c6a3015574785d9ad68eaebcd808cd08521 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -107,7 +107,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_get_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod( @@ -115,7 +115,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neo KAI_ASSUME((k % bl) == 0); KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_get_rhs_packed_stride(k, bl); + return 
(n_idx / kai_nr) * kai_get_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c index 9a0670c65221e55f96f3f035f2471ae80d6f21f5..34fd180a82d4b56a66a1e049607d0a7ad09e8993 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod(vo size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod( @@ -92,7 +92,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_ KAI_ASSERT((k % bl) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c 
b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c index f05c4a53d0182fbfe4f20bef2392248521a903fe..45b6fa60e795911cfc8c184f81846535c4350a21 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod(vo size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod( @@ -92,7 +92,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_ KAI_ASSERT((k % bl) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c index 4f2ba1687e90743ab84175882675252fa54331b4..5750beef3baf8a9a0ee20cea9de718ef70860a00 100644 --- 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -107,14 +107,14 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(void size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_get_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod( size_t n_idx, size_t k, size_t bl) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_get_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_get_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.c index cac6a53caf7c1a1c42c35a9ad38636292b5270ec..096d4511ffeb5b4e6c70c8ca279637ba692e784d 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its 
affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,12 +84,12 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm(void size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm( size_t n_idx, size_t k, size_t bl) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c index bda833f36ddd92369b5c98c832a37b73f0b17111..3e55af1ae702d216e510a0de6f9afe569c479b08 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm( @@ -92,7 +92,7 @@ size_t 
kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_ KAI_ASSERT((k % bl) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.c index 756b0b47f37ad0052c45c02a39513fb2785fd191..99fb07757219f0ee6b906a2cc003da8415c9a0a4 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm( @@ -92,7 +92,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_ KAI_ASSERT((k % bl) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm( diff 
--git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c index c50b2c659dedf853fb92a260502acfe1a3d7d811..217125a410196776945680313b5948a2f24b623e 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -78,13 +78,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_get_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - - return (n_idx / kai_n_step) * kai_get_rhs_packed_stride(k); + const size_t nr = kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(); + return (n_idx / nr) * kai_get_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c index 6073136afe6028934874467d4b9bb2560081907a..ab136e1a6033aa418b2d155051caf7f834bbc58a 100644 --- 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -93,13 +93,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_get_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_get_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_get_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_get_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c index e44a5fd945065b15ec71294a7bf1a973513ee9b6..03cf6abb0b7a18ec8cb9b44170580e91a3aefaa7 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: 
Apache-2.0 // @@ -77,13 +77,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(voi size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c index 54d63ae74d3fae56f11914c53bccadeea8bd8266..c9122f5a5d467765c8426f899388e7c607597289 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -77,13 +77,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(voi size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t 
kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.c index 9e8aec71427185cf60f0de83f432b1614f856656..2e8146609d54fff317150c3de7deefb378dbc1f8 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -76,13 +76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod(voi size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi4cxp8x4_8x8x32_neon_dotprod( diff --git 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c index 1f24fb465f89e526dca477bcd1acb55d92ddf1fb..f899cc1d6d7f1f8908370bebec3666aadb23e6f8 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -76,13 +76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(vo size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c index ae71f09c1559fbb40f79a433348f140d7da968a2..747c8d5f340aebfaa1f22e900aa62549171c4959 100644 --- 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -38,8 +38,7 @@ inline static size_t kai_lhs_packed_stride(size_t k) { KAI_ASSERT((k_internal % 2) == 0); - return kai_mr * - (k_internal * sizeof(int8_t) + kai_num_bytes_multiplier_lhs + kai_num_bytes_offset_lhs + kai_num_bytes_bias); + return kai_mr * (k_internal * sizeof(int8_t) + kai_num_bytes_multiplier_lhs + kai_num_bytes_offset_lhs); } inline static size_t kai_rhs_packed_stride(size_t k) { @@ -77,13 +76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c index 7853163113d5ead9598cb9fb273bb0e2e485686a..4c6cedcd2989248153532d3403468830fd312642 100644 --- 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -76,13 +76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c index 51585fe0b5522c5f2b5caa64938cb053df6a4dce..fa89d5702ceb228202b97bb0c30a637a84d87990 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -76,13 
+76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c index b310bf748e42e1a4de205fbaa592be00daa25492..b85b0210164ba63f22c804b9c59b202293d3643a 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -76,13 +76,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t n_idx, size_t k) { 
KAI_ASSERT((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c index 5dcd0bddc4410ed74bd67feb6e608f7042461e85..bd5246fa692d7bf88fd9fc038fb34c99edab431a 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -91,13 +91,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c 
b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c index bedacef6287d67208c0925c02464845c92bb14b0..784a1165aae26d228161c56b07d656a0db096b96 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -91,13 +91,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c index 1bec354b25f47e1f4628ee1e12cc7f845383ae06..4c1b4433a86d4f02adf64d1a3000c47538dec6ea 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c @@ -1,5 +1,5 @@ 
// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -91,13 +91,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod(void) size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c index db5edc633059c532e48be37ac6f10351ac9d85d0..91c9dce9494024d4023b4f6e24321e24c8b69e82 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -91,13 +91,13 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm(void) { size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm(size_t m_idx, size_t k) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / 
kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm(size_t n_idx, size_t k) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c index 88cf0a9a306732db292a1900b7c326d4b240b357..b1911981d3616348760e09c34587393905f4d813 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -100,17 +100,20 @@ size_t kai_get_sr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa( size_t m_idx, size_t k, size_t bl) { const size_t m_step = kai_get_m_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); + const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); KAI_ASSUME((m_idx % m_step) == 0); - return (m_idx / m_step) * kai_get_lhs_packed_stride(k, bl); + return (m_idx / mr) * kai_get_lhs_packed_stride(k, bl); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa( size_t n_idx, size_t k, 
size_t bl) { const size_t n_step = kai_get_n_step_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); + const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); + KAI_ASSUME((n_idx % n_step) == 0); - return (n_idx / n_step) * kai_get_rhs_packed_stride(k, bl); + return (n_idx / nr) * kai_get_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c index c66691c948297668e542e167c21d3f4c381c7d98..002b55fab6a966ca428053c082347c7befafbd71 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -100,15 +100,19 @@ size_t kai_get_sr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(voi size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot( size_t m_idx, size_t k, size_t bl) { const size_t m_step = kai_get_m_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); + const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); + KAI_ASSUME((m_idx % m_step) == 0); - return (m_idx / m_step) * kai_get_lhs_packed_stride(k, bl); + return (m_idx / mr) * kai_get_lhs_packed_stride(k, bl); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot( size_t n_idx, size_t k, size_t bl) 
{ const size_t n_step = kai_get_n_step_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); - KAI_ASSUME((n_idx % n_step) == 0); + const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); + + KAI_ASSUME((n_idx % nr) == 0); return (n_idx / n_step) * kai_get_rhs_packed_stride(k, bl); } diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c index 6447110a7ca6784de10a0134a406f8efceaa9441..2e8affbdf92af05ec4687edd1383ca92ef4f6bc9 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -95,14 +95,14 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_ne size_t m_idx, size_t k, size_t bl) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k, bl); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k, bl); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod( size_t n_idx, size_t k, size_t bl) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c 
b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c index af49c7bf28263d68a28cfe2809417a503a0623e6..69096ff732778cd4898d91568cda70d008d86b93 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -83,7 +83,7 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32 KAI_ASSUME((k % bl) == 0); KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod( @@ -94,7 +94,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32 KAI_ASSUME((k % bl) == 0); KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c index 00bc1be88128cfbad1ee943c1ea2fc27ff0191e5..7d40fbbc783bd97560b74b4f355ae61bca2a30eb 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c +++ 
b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -95,14 +95,14 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_n size_t m_idx, size_t k, size_t bl) { KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k, bl); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k, bl); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod( size_t n_idx, size_t k, size_t bl) { KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k, bl); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k, bl); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c index f927501238302ff5bcd82f708b563f8ca6244430..2e891b3090940e00c747112fba344c569339ecb8 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_n KAI_ASSUME((k % bl) == 0); KAI_ASSUME((m_idx % kai_m_step) == 0); 
- return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm( @@ -95,7 +95,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_n KAI_ASSUME((k % bl) == 0); KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm( diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c index 8420c134924fb56187a00657865ba85b738d5a2c..df0e346566f15343acdfc46d52393b00288688ac 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -84,7 +84,7 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32 KAI_ASSUME((k % bl) == 0); KAI_ASSUME((m_idx % kai_m_step) == 0); - return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); + return (m_idx / kai_mr) * kai_lhs_packed_stride(k); } size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm( @@ -95,7 +95,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32 KAI_ASSUME((k % bl) == 0); KAI_ASSUME((n_idx % kai_n_step) == 0); - return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); + return (n_idx / 
kai_nr) * kai_rhs_packed_stride(k); } size_t kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm( diff --git a/test/common/test_suite.hpp b/test/common/test_suite.hpp index d0c9ed9a21da37553da2d19fc6e0306172dfe59e..c20f71cc20c3def51a1e1f53731013e3d7512065 100644 --- a/test/common/test_suite.hpp +++ b/test/common/test_suite.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -27,15 +27,20 @@ kai_get_dst_offset_matmul_## name, \ kai_get_dst_size_matmul_## name, \ kai_run_matmul_## name} -// clang-format on #define UKERNEL_MATMUL_PACK_VARIANT(name, features_check, lhs_pack, rhs_pack) \ { \ - {UKERNEL_MATMUL_VARIANT(name), "kai_matmul_" #name, features_check}, { \ - kai_get_lhs_packed_size_##lhs_pack, kai_get_rhs_packed_size_##rhs_pack, kai_run_##lhs_pack, \ - kai_run_##rhs_pack \ + {UKERNEL_MATMUL_VARIANT(name), "kai_matmul_" #name, features_check}, \ + { \ + kai_get_lhs_packed_size_##lhs_pack, \ + kai_get_rhs_packed_size_##rhs_pack, \ + kai_get_lhs_packed_offset_##lhs_pack, \ + kai_get_rhs_packed_offset_##rhs_pack, \ + kai_run_##lhs_pack, \ + kai_run_##rhs_pack \ } \ } +// clang-format on namespace kai::test { diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 063ac0730454349488fe8c7060fc50dbc690a3f0..ab603be0416b8522222efcd891aab918effbaeb8 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -81,26 +81,32 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_RHS) { GTEST_SKIP(); } + const size_t M = matmul_shape.m; + const size_t N = matmul_shape.n; const size_t K = matmul_shape.k; + + auto m_step = ukernel_variant.interface.get_m_step(); + auto n_step = ukernel_variant.interface.get_n_step(); + + 
const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + const auto nr = ukernel_variant.interface.get_nr(); const auto kr = ukernel_variant.interface.get_kr(); const auto sr = ukernel_variant.interface.get_sr(); kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - auto n_step = ukernel_variant.interface.get_n_step(); - auto a_tmp = ukernel_variant.interface.get_rhs_packed_offset(n_step, K, bl) / n_step; - auto b_tmp = ukernel_variant.interface.get_rhs_packed_offset(n_step * 16, K, bl) / (n_step * 16); - - ASSERT_EQ(a_tmp, b_tmp); - + const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset_kxn = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(n_step, K, nr, kr, sr, bl, scale_dt); + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); auto rhs_packed_offset_nxk = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(n_step, K, nr, kr, sr, bl, scale_dt); + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); ASSERT_EQ(rhs_packed_offset_kxn, rhs_packed_offset_nxk); - auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(n_step, K, bl); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); ASSERT_EQ(rhs_packed_offset_kxn, rhs_matmul_offset); } @@ -112,13 +118,27 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_LHS) { GTEST_SKIP(); } + const size_t M = matmul_shape.m; + const size_t N = matmul_shape.n; const size_t K = matmul_shape.k; auto m_step = ukernel_variant.interface.get_m_step(); - auto a_tmp = ukernel_variant.interface.get_lhs_packed_offset(m_step, K) / m_step; - auto b_tmp = ukernel_variant.interface.get_lhs_packed_offset(m_step * 16, K) / (m_step * 16); + auto n_step = ukernel_variant.interface.get_n_step(); + + const auto rect = portion.compute_portion(M, N, 
m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } - ASSERT_EQ(a_tmp, b_tmp); + const auto mr = ukernel_variant.interface.get_mr(); + const auto kr = ukernel_variant.interface.get_kr(); + const auto sr = ukernel_variant.interface.get_sr(); + + const auto lhs_start_row = rect.start_row(); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); } TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { @@ -180,6 +200,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, @@ -194,22 +216,29 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const size_t ref_rhs_qsu4_stride = round_up_division(K, 2); - const size_t ref_rhs_scales_stride = round_up_division(K, bl) * sizeof(uint16_t); + const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); std::vector imp_packed_rhs(imp_packed_rhs_size); - const auto packed_rhs_start_row = rect.start_col(); + const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = - 
kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(packed_rhs_start_row, K, nr, kr, sr, bl, scale_dt); + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); + + auto rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride); + size_t bias_offset = rhs_start_row * sizeof(float); + size_t scale_offset = rhs_start_row * ref_rhs_scales_stride; constexpr kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{ .lhs_zero_point = 1, .rhs_zero_point = 8, .scale_dt = kai_datatype::kai_dt_bf16}; kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( - 1, N, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data(), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), - ref_rhs_scales_stride, imp_packed_rhs.data(), 0, ¶ms); + 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + reinterpret_cast(ref_biases.data() + bias_offset), + reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, + imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -222,8 +251,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { // Runs the GEMM micro-kernel. 
std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_packed_offset, - imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), + rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation for the portion @@ -314,6 +343,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, @@ -327,11 +358,17 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const auto ref_rhs_qsu4_padded = pad_row( ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const size_t ref_rhs_qsu4_stride = round_up_division(N, 2); - const size_t ref_rhs_scales_stride = round_up_division(K, bl) * sizeof(uint16_t); + const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); - const auto packed_rhs_start_row = rect.start_col(); + const auto rhs_start_row = rect.start_col(); + auto rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride); auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(packed_rhs_start_row, K, nr, kr, sr, 
bl, scale_dt); + kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); + + size_t bias_offset = rhs_start_row * sizeof(float); + size_t scale_offset = rhs_start_row * ref_rhs_scales_stride; const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); @@ -339,9 +376,9 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { constexpr kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{ .lhs_zero_point = 1, .rhs_zero_point = 8, .scale_dt = kai_datatype::kai_dt_bf16}; kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( - 1, N, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data(), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data()), ref_rhs_scales.data(), ref_rhs_scales_stride, - imp_packed_rhs.data(), 0, ¶ms); + 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + reinterpret_cast(ref_biases.data() + bias_offset), ref_rhs_scales.data() + scale_offset, + ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -353,8 +390,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_packed_offset, - imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), + rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * 
sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation. diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 402af683a0e7d014b5205b3f1c54c85298d4fe42..ae0415c24ff7d61370205f30e07c758a8f25aa12 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -56,12 +56,14 @@ enum class RhsPackType { NxK, KxN }; using ukernel_rhs_pack_function = std::function; using ukernel_get_rhs_packed_size = std::function; using ukernel_get_rhs_packed_offset = std::function; +using ukernel_get_rhs_offset = std::function; template struct UkernelVariantCustom : public UkernelVariant { ukernel_rhs_pack_function run_rhs_pack; ukernel_get_rhs_packed_size get_rhs_packed_size; ukernel_get_rhs_packed_offset get_rhs_packed_offset; + ukernel_get_rhs_offset get_rhs_offset; RhsPackType rhs_pack_type; bool signed_integer_support; @@ -70,12 +72,13 @@ struct UkernelVariantCustom : public UkernelVariant { UkernelVariantCustom( T interface, std::string_view name, const std::function& fn_is_supported, ukernel_rhs_pack_function run_rhs_pack, ukernel_get_rhs_packed_size get_rhs_packed_size, - ukernel_get_rhs_packed_offset get_rhs_packed_offset, const RhsPackType pack_type, - const bool signed_integer_support) : + ukernel_get_rhs_packed_offset get_rhs_packed_offset, ukernel_get_rhs_offset get_rhs_offset, + const RhsPackType pack_type, const bool signed_integer_support) : UkernelVariant(interface, name, fn_is_supported), run_rhs_pack(std::move(run_rhs_pack)), get_rhs_packed_size(std::move(get_rhs_packed_size)), 
get_rhs_packed_offset(std::move(get_rhs_packed_offset)), + get_rhs_offset(std::move(get_rhs_offset)), rhs_pack_type(pack_type), signed_integer_support(signed_integer_support) { } @@ -87,101 +90,182 @@ static const std::array imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 0}; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsi4_padded.data() + rhs_offset, + reinterpret_cast(ref_biases.data() + bias_offset), + reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, + 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -280,8 +372,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, - imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation for the portion @@ -362,7 +454,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); - 
ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, @@ -377,15 +470,22 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); - const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); - ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); + const auto rhs_start_row = rect.start_col(); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(rhs_start_row, K, nr, kr, sr); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); + + auto rhs_offset = ukernel_variant.get_rhs_offset(rhs_start_row, round_up_division(K, 2)); + size_t bias_offset = rhs_start_row * sizeof(float); + size_t scale_offset = rhs_start_row * sizeof(float); std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 8}; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsu4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsu4_padded.data() + rhs_offset, + reinterpret_cast(ref_biases.data() + bias_offset), + reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, + 0, ¶ms); const auto dst_stride = N * 
sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -396,15 +496,19 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + // Compares the output of the micro-kernels against the output of the reference implementation for the portion + // tested. + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? 
std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -487,7 +591,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { std::vector imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); - ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, @@ -500,9 +605,11 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { const auto ref_rhs_qsi4_padded = pad_row( ref_rhs_qsi4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); - const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); - ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); + + const auto rhs_start_row = rect.start_col(); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(rhs_start_row, K, nr, kr, sr); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 0}; @@ -520,8 +627,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); 
ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, - imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation for the portion @@ -614,7 +721,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); - ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, @@ -628,10 +736,11 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { const auto ref_rhs_qsu4_padded = pad_row( ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); - const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); - ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); + + const auto rhs_start_row = rect.start_col(); + auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(rhs_start_row, K, nr, kr, sr); + auto 
rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 8}; @@ -649,8 +758,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, - imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation. 
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp index 34a44681c9f73619c54f4ca4e64e3f50c196da67..ccdc10851e1fa525e69da1589334e779a23f27a9 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -49,6 +49,51 @@ static const std::array +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -45,15 +45,20 @@ namespace kai::test { // Interface for the LHS and RHS packed size and packing functions using kai_get_lhs_packed_size_func_t = decltype(&kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32); using kai_get_rhs_packed_size_func_t = decltype(&kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32); -using kai_rhs_pack_func_t = decltype(&kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); +using kai_get_lhs_packed_offset_func_t = decltype(&kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32); +using kai_get_rhs_packed_offset_func_t = + decltype(&kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); +using kai_run_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32); +using kai_run_rhs_pack_func_t = decltype(&kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); // Micro-kernel interface struct kai_matmul_f32_qsi8d32p_qsi4c32p_pack_functions { kai_get_lhs_packed_size_func_t lhs_packed_size; kai_get_rhs_packed_size_func_t rhs_packed_size; - kai_lhs_pack_func_t lhs_pack; - kai_rhs_pack_func_t rhs_pack; + kai_get_lhs_packed_offset_func_t get_lhs_packed_offset; + kai_get_rhs_packed_offset_func_t get_rhs_packed_offset; + 
kai_run_lhs_pack_func_t lhs_pack; + kai_run_rhs_pack_func_t rhs_pack; }; static const std::array< @@ -84,6 +89,47 @@ static const std::array< class MatMulTest_f32_qsi8d32p_qsi4c32p : public UkernelVariantTest {}; +TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, Offset_RHS) { + const auto& [variant_index, matmul_shape] = GetParam(); + const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qsi8d32p_qsi4c32p.at(variant_index); + + if (ukernel_variant.ukernel.fn_is_supported && !ukernel_variant.ukernel.fn_is_supported()) { + GTEST_SKIP(); + } + + const size_t bl = 32; + const size_t K = matmul_shape.k; + const auto nr = ukernel_variant.ukernel.interface.get_nr(); + const auto kr = ukernel_variant.ukernel.interface.get_kr(); + auto n_step = ukernel_variant.ukernel.interface.get_n_step(); + + auto rhs_packed_offset = ukernel_variant.pack_interface.get_rhs_packed_offset(n_step, K, nr, kr, bl); + auto rhs_matmul_offset = ukernel_variant.ukernel.interface.get_rhs_packed_offset(n_step, K, bl); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); +} + +TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, Offset_LHS) { + const auto& [variant_index, matmul_shape] = GetParam(); + const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qsi8d32p_qsi4c32p.at(variant_index); + + if (ukernel_variant.ukernel.fn_is_supported && !ukernel_variant.ukernel.fn_is_supported()) { + GTEST_SKIP(); + } + + const size_t bl = 32; + const size_t K = matmul_shape.k; + const auto mr = ukernel_variant.ukernel.interface.get_mr(); + const auto kr = ukernel_variant.ukernel.interface.get_kr(); + const auto sr = ukernel_variant.ukernel.interface.get_sr(); + + auto m_step = ukernel_variant.ukernel.interface.get_m_step(); + + auto lhs_packed_offset = ukernel_variant.pack_interface.get_lhs_packed_offset(m_step, K, bl, mr, kr, sr); + auto lhs_matmul_offset = ukernel_variant.ukernel.interface.get_lhs_packed_offset(m_step, K, bl); + + ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); +} + 
TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { const auto& [variant_index, matmul_shape] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qsi8d32p_qsi4c32p.at(variant_index);