From b717a414a266f1599c7d5a3de63f256a69f37044 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 20 Jan 2025 12:08:11 +0000 Subject: [PATCH 1/4] Fix compilation issues - Add inclusion of the header file in the RHS packing functions - Add explicit cast in the BF16 packing functions Signed-off-by: Gian Marco Iodice --- .../pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c | 2 + .../pack/kai_lhs_pack_bf16p8x4_f16_neon.c | 6 +- .../pack/kai_lhs_pack_f32p2vlx1_f32_sme.c | 4 +- .../pack/kai_lhs_pack_x16p2vlx2_x16_sme.c | 4 +- .../kai_lhs_quant_pack_bf16p1x4_f32_neon.c | 6 +- .../kai_lhs_quant_pack_bf16p8x4_f32_neon.c | 10 +-- ...i_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h | 72 ++++++++++--------- ...i_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c | 8 ++- ...hs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c | 4 +- ...hs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c | 4 +- ...rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c | 4 +- ...hs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c | 4 +- ...k_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c | 2 +- ...s_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h | 17 ++++- .../kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c | 4 +- ...quant_pack_kxn_bf16p12x4biasf32_f32_neon.c | 13 ++-- 16 files changed, 105 insertions(+), 59 deletions(-) diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c index 7698ce5d..cc97c7b4 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c +++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. 
+#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c index 723de695..989c6e5f 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c +++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,8 @@ #error This file must be compiled for AArch64, FEAT_BF16, FEAT_FP16. #else // Architectural features check. +#include "kai_lhs_pack_bf16p8x4_f16_neon.h" + #include #include @@ -69,7 +71,7 @@ void kai_run_lhs_pack_bf16p8x4_f16_neon( size_t width = k; for (size_t y = 0; y < height; y++) { - in[y] = (char*)lhs + (block_y + y) * lhs_stride; + in[y] = (const char*)lhs + (block_y + y) * lhs_stride; } __asm__ __volatile__( diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c index 97268a64..8c2dd83e 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c +++ b/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. 
+#include "kai_lhs_pack_f32p2vlx1_f32_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c index 6d467a98..ab131e84 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c +++ b/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. +#include "kai_lhs_pack_x16p2vlx2_x16_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c index a53b0088..22fb3160 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_BF16. #else // Architectural features check. 
+#include "kai_lhs_quant_pack_bf16p1x4_f32_neon.h" + #include #include #include @@ -64,7 +66,7 @@ void kai_run_lhs_quant_pack_bf16p1x4_f32_neon( KAI_ASSUME(m_idx_start == 0); - const float* lhs_ptr = (float*)(lhs); + const float* lhs_ptr = (const float*)(lhs); uint16_t* lhs_packed_ptr = (uint16_t*)(lhs_packed); // Unroll two 256-bit loops diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c index 6022ac91..60862dc4 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #define MAX_MR 8 +#include "kai_lhs_quant_pack_bf16p8x4_f32_neon.h" + #include #include #include @@ -50,8 +52,8 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_bf16p8x4_f32_neon(size_t m, size_t } void kai_run_lhs_quant_pack_bf16p8x4_f32_neon( - size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, - uint16_t* lhs_packed) { + size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride, + void* lhs_packed) { KAI_ASSUME(mr == kai_mr); KAI_ASSUME(sr == kai_sr); KAI_ASSUME(kr == kai_kr); @@ -73,7 +75,7 @@ void kai_run_lhs_quant_pack_bf16p8x4_f32_neon( size_t width = k; for (size_t y = 0; y < height; y++) { - in[y] = (char*)lhs + (block_y + y) * lhs_stride; + in[y] = (const char*)lhs + (block_y + y) * lhs_stride; } __asm__ __volatile__( diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h index cd386302..bbaa44fc 100644 --- 
a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -41,42 +41,48 @@ size_t kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n_idx); /// @return The offset in bytes to the data element. size_t kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n_idx, size_t k); -/// Gets the size in bytes of the packed RHS buffer. +/// Get the row stride in bytes to the packed RHS matrix /// -/// @param[in] n Number of rows. /// @param[in] k Number of columns. /// -/// @return The size in bytes of the packed RHS buffer. -size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k); +/// @return The row stride in bytes to the packed RHS matrix. +size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k) { + /// Gets the size in bytes of the packed RHS buffer. + /// + /// @param[in] n Number of rows. + /// @param[in] k Number of columns. + /// + /// @return The size in bytes of the packed RHS buffer. + size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k); -/// Runs the RHS packing function for matrix multiplication. -/// -/// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset -/// calculated using the following functions: -/// -/// * RHS: @ref -/// kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. -/// * Bias: @ref -/// kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. -/// * Output: @ref -/// kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. -/// -/// @param[in] num_groups Number of groups. It must be 1. -/// @param[in] n Number of columns of the output matrix. 
-/// @param[in] k Common dimension between the LHS and RHS matrix. -/// @param[in] nr Block size in N dimension. It must be 12. -/// @param[in] kr Block size in K dimension. It must be 4. -/// @param[in] sr Number of kr splits. It must be 1. -/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. -/// @param[in] rhs RHS matrix data buffer. -/// @param[in] bias Bias matrix data buffer. -/// @param[in] scale Scale data buffer. It must be NULL. -/// @param[out] rhs_packed Packed RHS matrix. -/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. -/// @param[in] params Extra packing parameters. It must be NULL. -void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( - size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, - const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); + /// Runs the RHS packing function for matrix multiplication. + /// + /// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset + /// calculated using the following functions: + /// + /// * RHS: @ref + /// kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. + /// * Bias: @ref + /// kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. + /// * Output: @ref + /// kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. + /// + /// @param[in] num_groups Number of groups. It must be 1. + /// @param[in] n Number of columns of the output matrix. + /// @param[in] k Common dimension between the LHS and RHS matrix. + /// @param[in] nr Block size in N dimension. It must be 12. + /// @param[in] kr Block size in K dimension. It must be 4. + /// @param[in] sr Number of kr splits. It must be 1. + /// @param[in] rhs_stride Row stride in bytes of the RHS matrix. + /// @param[in] rhs RHS matrix data buffer. + /// @param[in] bias Bias matrix data buffer. + /// @param[in] scale Scale data buffer. 
It must be NULL. + /// @param[out] rhs_packed Packed RHS matrix. + /// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. + /// @param[in] params Extra packing parameters. It must be NULL. + void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( + size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, + const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); #ifdef __cplusplus } // extern "C" diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c index 2c4d5e5c..db291525 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -9,6 +9,8 @@ #error This file must be compiled for AArch64, FEAT_BF16, FEAT_FP16. #else // Architectural features check. +#include "kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.h" + #include #include #include @@ -61,7 +63,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf32_f16_neon( const void* in = rhs; void* out = rhs_packed; const size_t in_stride = rhs_stride; - uint16_t* pad_row = (uint16_t*)rhs; + const uint16_t* pad_row = (const uint16_t*)rhs; // Fill zeros if bias is nullptr size_t bias_step = nr * sizeof(float); @@ -72,7 +74,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf32_f16_neon( bias_step = 0; } - const void* bias_ptr = bias == NULL ? (void*)zero_bias : (void*)bias; + const void* bias_ptr = bias == NULL ? 
(void*)zero_bias : (const void*)bias; size_t out_stride = kai_nr * kai_roundup(height, kai_kr) * sizeof(uint16_t) + kai_nr * sizeof(uint32_t); diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c index 61b8ba48..d85533d9 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64. #else // Architectural features check. +#include "kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c index 5ce709fa..55e49474 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. 
+#include "kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c index 0c6b0074..afa3d8b5 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64. #else // Architectural features check. +#include "kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c index b8fd3fdc..29870b43 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. 
+#include "kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c index 419f5cd0..cb32cdc2 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h index f06e736c..b4da82ca 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -50,6 +50,21 @@ size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( size_t kr, // size_t bl); // +/// Gets the row stride in bytes of the quantized and packed RHS matrix. +/// +/// @param[in] k The number of columns in the RHS matrix (not packed). +/// @param[in] nr The number of columns written by the matmul micro-kernel +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] bl The block length, which defines the number of K values stored in a single block. It must be a multiple +/// of 32. 
+/// +/// @return the row stride in bytes of the packed RHS matrix +size_t kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( + size_t k, // + size_t nr, // + size_t kr, // + size_t bl); // + /// Gets the size in bytes for the quantized and packed RHS matrix. /// /// @param[in] n The number of rows in the RHS matrix (not packed) diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c index 4a471441..670dda9d 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. +#include "kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c b/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c index 67119418..d22136f0 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #define MAX_NR 12 +#include "kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.h" + #include #include #include @@ -21,9 +23,8 @@ static const size_t kai_nr = 12; static const size_t kai_kr = 4; static const size_t kai_sr = 1; -size_t kai_get_n_step_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t nr) { - 
KAI_ASSUME(kai_nr == nr); - return nr; +size_t kai_get_n_step_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(void) { + return kai_nr; } size_t kai_get_rhs_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t n_idx) { @@ -66,7 +67,7 @@ void kai_run_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon( size_t height = k; const size_t width = n; - const void* in = (void*)rhs; + const void* in = (const void*)rhs; void* out = rhs_packed; const size_t in_stride = rhs_stride; const float* pad_row = rhs; @@ -80,7 +81,7 @@ void kai_run_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon( bias_step = 0; } - const void* bias_ptr = bias == NULL ? (void*)zero_bias : (void*)bias; + const void* bias_ptr = bias == NULL ? (void*)zero_bias : (const void*)bias; const size_t out_stride = nr * kai_roundup(height, kr) * sizeof(uint16_t) + nr * sizeof(uint32_t); -- GitLab From 7a68971b28598d0333e76427d3b0ada17cb26699 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 20 Jan 2025 14:38:42 +0000 Subject: [PATCH 2/4] Fix the header file inclusion in other kernels Signed-off-by: Gian Marco Iodice --- ...tmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c | 2 ++ ...mp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c | 8 ++++---- ..._clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c | 8 ++++---- ...ul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c | 2 ++ .../matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c | 2 +- .../pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h | 7 +++++++ 6 files changed, 20 insertions(+), 9 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c index 7e1a3ade..0d6e68d2 100644 --- a/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c +++ 
b/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. +#include "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h" + #include #include #include diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c index b1911981..b76754f2 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c @@ -158,10 +158,10 @@ void kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa( const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(); - const uint16_t* lhs_scales = - (uint16_t*)((const int8_t*)lhs_packed + lhs_packed_stride - (mr * num_blocks) * kai_num_bytes_multiplier_lhs); - const uint16_t* rhs_scales = - (uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - (nr * num_blocks) * kai_num_bytes_multiplier_rhs); + const uint16_t* lhs_scales = (const uint16_t*)((const uint8_t*)lhs_packed + lhs_packed_stride - + (mr * num_blocks) * kai_num_bytes_multiplier_lhs); + const uint16_t* rhs_scales = (const uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - + (nr * num_blocks) * kai_num_bytes_multiplier_rhs); __asm__ volatile( // Switch to streaming mode with ZA enabling diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c 
b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c index 002b55fa..9c27a588 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c @@ -161,10 +161,10 @@ void kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot( const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(); - const uint16_t* lhs_scales = - (uint16_t*)((const int8_t*)lhs_packed + lhs_packed_stride - (mr * num_blocks) * kai_num_bytes_multiplier_lhs); - const uint16_t* rhs_scales = - (uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - (nr * num_blocks) * kai_num_bytes_multiplier_rhs); + const uint16_t* lhs_scales = (const uint16_t*)((const uint8_t*)lhs_packed + lhs_packed_stride - + (mr * num_blocks) * kai_num_bytes_multiplier_lhs); + const uint16_t* rhs_scales = (const uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - + (nr * num_blocks) * kai_num_bytes_multiplier_rhs); __asm__ volatile( // Switch to streaming mode with ZA enabling diff --git a/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c index 4d6e4f00..902b11e7 100644 --- a/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c +++ b/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c @@ -8,6 +8,8 @@ #error This file must be compiled for AArch64, FEAT_SVE2. #else // Architectural features check. 
+#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h" + #include #include diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c index cc97c7b4..272942df 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c +++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c @@ -83,7 +83,7 @@ void kai_run_lhs_pack_bf16p2vlx2_f32_sme( void* out = (void*)((char*)lhs_packed + block_y * kai_roundup(k, kai_kr) * sizeof(uint16_t)); for (size_t y = 0; y < height; y++) { - in[y] = (void*)((char*)lhs + (block_y + y) * lhs_stride); + in[y] = (const void*)((const char*)lhs + (block_y + y) * lhs_stride); } __asm__ __volatile__( diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h index 8bd3f6ee..bfd5c393 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h @@ -42,6 +42,13 @@ size_t kai_get_bias_offset_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t n_id /// @return The offset in bytes to the data element. size_t kai_get_rhs_packed_offset_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t n_idx, size_t k); +/// Get the row stride in bytes to the packed RHS matrix +/// +/// @param[in] k The number of columns in the RHS matrix (not packed). +/// +/// @return the stride in bytes to the packed RHS matrix +size_t kai_get_rhs_packed_stride_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t k); + /// Gets the size in bytes of the packed RHS buffer. /// /// @param[in] n Number of rows. 
-- GitLab From 82ccf562cb200936acba8dd9b0a772746a9347a4 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 20 Jan 2025 17:01:04 +0000 Subject: [PATCH 3/4] Fix compilation issue - Missing ; for the end of function declaration in the header file Signed-off-by: Gian Marco Iodice --- ...i_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h index bbaa44fc..27d03346 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h @@ -46,43 +46,43 @@ size_t kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n /// @param[in] k Number of columns. /// /// @return The row stride in bytes to the packed RHS matrix. -size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k) { - /// Gets the size in bytes of the packed RHS buffer. - /// - /// @param[in] n Number of rows. - /// @param[in] k Number of columns. - /// - /// @return The size in bytes of the packed RHS buffer. - size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k); +size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k); +/// Gets the size in bytes of the packed RHS buffer. +/// +/// @param[in] n Number of rows. +/// @param[in] k Number of columns. +/// +/// @return The size in bytes of the packed RHS buffer. +size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k); - /// Runs the RHS packing function for matrix multiplication. - /// - /// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset - /// calculated using the following functions: - /// - /// * RHS: @ref - /// kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. 
- /// * Bias: @ref - /// kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. - /// * Output: @ref - /// kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. - /// - /// @param[in] num_groups Number of groups. It must be 1. - /// @param[in] n Number of columns of the output matrix. - /// @param[in] k Common dimension between the LHS and RHS matrix. - /// @param[in] nr Block size in N dimension. It must be 12. - /// @param[in] kr Block size in K dimension. It must be 4. - /// @param[in] sr Number of kr splits. It must be 1. - /// @param[in] rhs_stride Row stride in bytes of the RHS matrix. - /// @param[in] rhs RHS matrix data buffer. - /// @param[in] bias Bias matrix data buffer. - /// @param[in] scale Scale data buffer. It must be NULL. - /// @param[out] rhs_packed Packed RHS matrix. - /// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. - /// @param[in] params Extra packing parameters. It must be NULL. - void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( - size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, - const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); +/// Runs the RHS packing function for matrix multiplication. +/// +/// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset +/// calculated using the following functions: +/// +/// * RHS: @ref +/// kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. +/// * Bias: @ref +/// kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. +/// * Output: @ref +/// kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon. +/// +/// @param[in] num_groups Number of groups. It must be 1. +/// @param[in] n Number of columns of the output matrix. +/// @param[in] k Common dimension between the LHS and RHS matrix. +/// @param[in] nr Block size in N dimension. It must be 12. 
+/// @param[in] kr Block size in K dimension. It must be 4. +/// @param[in] sr Number of kr splits. It must be 1. +/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. +/// @param[in] rhs RHS matrix data buffer. +/// @param[in] bias Bias matrix data buffer. +/// @param[in] scale Scale data buffer. It must be NULL. +/// @param[out] rhs_packed Packed RHS matrix. +/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. +/// @param[in] params Extra packing parameters. It must be NULL. +void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( + size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, + const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); #ifdef __cplusplus } // extern "C" -- GitLab From 764fd8949dc407e8a438ea855d911bb239fcc646 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 20 Jan 2025 17:07:44 +0000 Subject: [PATCH 4/4] Fix explicit cast in Matmul BF16P12x4BIASF16_F16 Signed-off-by: Gian Marco Iodice --- .../matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c index cff2dd0f..fae15ccb 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c @@ -69,7 +69,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( const void* in = rhs; void* out = rhs_packed; const size_t in_stride = rhs_stride; - const uint16_t* pad_row = (uint16_t*)rhs; + const uint16_t* pad_row = (const uint16_t*)rhs; // Fill zeros if bias is nullptr size_t bias_step = nr * sizeof(uint16_t); @@ -80,7 +80,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon( bias_step = 0; } - const void* bias_ptr = bias 
== NULL ? (void*)zero_bias : (void*)bias; + const void* bias_ptr = bias == NULL ? (void*)zero_bias : (const void*)bias; size_t out_stride = kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(height); -- GitLab