From b717a414a266f1599c7d5a3de63f256a69f37044 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Mon, 20 Jan 2025 12:08:11 +0000
Subject: [PATCH 1/4] Fix compilation issues

- Include the corresponding header file in the LHS and RHS packing functions
- Add explicit cast in the BF16 packing functions

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 .../pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c    |  2 +
 .../pack/kai_lhs_pack_bf16p8x4_f16_neon.c     |  6 +-
 .../pack/kai_lhs_pack_f32p2vlx1_f32_sme.c     |  4 +-
 .../pack/kai_lhs_pack_x16p2vlx2_x16_sme.c     |  4 +-
 .../kai_lhs_quant_pack_bf16p1x4_f32_neon.c    |  6 +-
 .../kai_lhs_quant_pack_bf16p8x4_f32_neon.c    | 10 +--
 ...i_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h | 72 ++++++++++---------
 ...i_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c |  8 ++-
 ...hs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c |  4 +-
 ...hs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c |  4 +-
 ...rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c |  4 +-
 ...hs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c |  4 +-
 ...k_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c |  2 +-
 ...s_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h | 17 ++++-
 .../kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c |  4 +-
 ...quant_pack_kxn_bf16p12x4biasf32_f32_neon.c | 13 ++--
 16 files changed, 105 insertions(+), 59 deletions(-)

diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
index 7698ce5d..cc97c7b4 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c
index 723de695..989c6e5f 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -9,6 +9,8 @@
 #error This file must be compiled for AArch64, FEAT_BF16, FEAT_FP16.
 #else  // Architectural features check.
 
+#include "kai_lhs_pack_bf16p8x4_f16_neon.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
@@ -69,7 +71,7 @@ void kai_run_lhs_pack_bf16p8x4_f16_neon(
         size_t width = k;
 
         for (size_t y = 0; y < height; y++) {
-            in[y] = (char*)lhs + (block_y + y) * lhs_stride;
+            in[y] = (const char*)lhs + (block_y + y) * lhs_stride;
         }
 
         __asm__ __volatile__(
diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c
index 97268a64..8c2dd83e 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_lhs_pack_f32p2vlx1_f32_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c
index 6d467a98..ab131e84 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_lhs_pack_x16p2vlx2_x16_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c
index a53b0088..22fb3160 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_BF16.
 #else  // Architectural features check.
 
+#include "kai_lhs_quant_pack_bf16p1x4_f32_neon.h"
+
 #include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -64,7 +66,7 @@ void kai_run_lhs_quant_pack_bf16p1x4_f32_neon(
 
     KAI_ASSUME(m_idx_start == 0);
 
-    const float* lhs_ptr = (float*)(lhs);
+    const float* lhs_ptr = (const float*)(lhs);
     uint16_t* lhs_packed_ptr = (uint16_t*)(lhs_packed);
 
     // Unroll two 256-bit loops
diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c
index 6022ac91..60862dc4 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -10,6 +10,8 @@
 
 #define MAX_MR 8
 
+#include "kai_lhs_quant_pack_bf16p8x4_f32_neon.h"
+
 #include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -50,8 +52,8 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_bf16p8x4_f32_neon(size_t m, size_t
 }
 
 void kai_run_lhs_quant_pack_bf16p8x4_f32_neon(
-    size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride,
-    uint16_t* lhs_packed) {
+    size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride,
+    void* lhs_packed) {
     KAI_ASSUME(mr == kai_mr);
     KAI_ASSUME(sr == kai_sr);
     KAI_ASSUME(kr == kai_kr);
@@ -73,7 +75,7 @@ void kai_run_lhs_quant_pack_bf16p8x4_f32_neon(
         size_t width = k;
 
         for (size_t y = 0; y < height; y++) {
-            in[y] = (char*)lhs + (block_y + y) * lhs_stride;
+            in[y] = (const char*)lhs + (block_y + y) * lhs_stride;
         }
 
         __asm__ __volatile__(
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
index cd386302..bbaa44fc 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -41,42 +41,48 @@ size_t kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n_idx);
 /// @return The offset in bytes to the data element.
 size_t kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n_idx, size_t k);
 
-/// Gets the size in bytes of the packed RHS buffer.
+/// Get the row stride in bytes to the packed RHS matrix
 ///
-/// @param[in] n Number of rows.
 /// @param[in] k Number of columns.
 ///
-/// @return The size in bytes of the packed RHS buffer.
-size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k);
+/// @return The row stride in bytes to the packed RHS matrix.
+size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k) {
+    /// Gets the size in bytes of the packed RHS buffer.
+    ///
+    /// @param[in] n Number of rows.
+    /// @param[in] k Number of columns.
+    ///
+    /// @return The size in bytes of the packed RHS buffer.
+    size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k);
 
-/// Runs the RHS packing function for matrix multiplication.
-///
-/// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset
-/// calculated using the following functions:
-///
-///   * RHS: @ref
-///   kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-///   * Bias: @ref
-///   kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-///   * Output: @ref
-///   kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-///
-/// @param[in] num_groups Number of groups. It must be 1.
-/// @param[in] n Number of columns of the output matrix.
-/// @param[in] k Common dimension between the LHS and RHS matrix.
-/// @param[in] nr Block size in N dimension. It must be 12.
-/// @param[in] kr Block size in K dimension. It must be 4.
-/// @param[in] sr Number of kr splits. It must be 1.
-/// @param[in] rhs_stride Row stride in bytes of the RHS matrix.
-/// @param[in] rhs RHS matrix data buffer.
-/// @param[in] bias Bias matrix data buffer.
-/// @param[in] scale Scale data buffer. It must be NULL.
-/// @param[out] rhs_packed Packed RHS matrix.
-/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0.
-/// @param[in] params Extra packing parameters. It must be NULL.
-void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
-    size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
-    const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params);
+    /// Runs the RHS packing function for matrix multiplication.
+    ///
+    /// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset
+    /// calculated using the following functions:
+    ///
+    ///   * RHS: @ref
+    ///   kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+    ///   * Bias: @ref
+    ///   kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+    ///   * Output: @ref
+    ///   kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+    ///
+    /// @param[in] num_groups Number of groups. It must be 1.
+    /// @param[in] n Number of columns of the output matrix.
+    /// @param[in] k Common dimension between the LHS and RHS matrix.
+    /// @param[in] nr Block size in N dimension. It must be 12.
+    /// @param[in] kr Block size in K dimension. It must be 4.
+    /// @param[in] sr Number of kr splits. It must be 1.
+    /// @param[in] rhs_stride Row stride in bytes of the RHS matrix.
+    /// @param[in] rhs RHS matrix data buffer.
+    /// @param[in] bias Bias matrix data buffer.
+    /// @param[in] scale Scale data buffer. It must be NULL.
+    /// @param[out] rhs_packed Packed RHS matrix.
+    /// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0.
+    /// @param[in] params Extra packing parameters. It must be NULL.
+    void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
+        size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
+        const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c
index 2c4d5e5c..db291525 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -9,6 +9,8 @@
 #error This file must be compiled for AArch64, FEAT_BF16, FEAT_FP16.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.h"
+
 #include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -61,7 +63,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf32_f16_neon(
     const void* in = rhs;
     void* out = rhs_packed;
     const size_t in_stride = rhs_stride;
-    uint16_t* pad_row = (uint16_t*)rhs;
+    const uint16_t* pad_row = (const uint16_t*)rhs;
 
     // Fill zeros if bias is nullptr
     size_t bias_step = nr * sizeof(float);
@@ -72,7 +74,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf32_f16_neon(
         bias_step = 0;
     }
 
-    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (void*)bias;
+    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (const void*)bias;
 
     size_t out_stride = kai_nr * kai_roundup(height, kai_kr) * sizeof(uint16_t) + kai_nr * sizeof(uint32_t);
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c
index 61b8ba48..d85533d9 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c
index 5ce709fa..55e49474 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c
index 0c6b0074..afa3d8b5 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c
index b8fd3fdc..29870b43 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
index 419f5cd0..cb32cdc2 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h
index f06e736c..b4da82ca 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -50,6 +50,21 @@ size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(
     size_t kr,     //
     size_t bl);    //
 
+/// Gets the row stride in bytes of the quantized and packed RHS matrix.
+///
+/// @param[in] k  The number of columns in the RHS matrix (not packed).
+/// @param[in] nr The number of columns written by the matmul micro-kernel
+/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in] bl The block length, which defines the number of K values stored in a single block. It must be a multiple
+/// of 32.
+///
+/// @return The row stride in bytes of the packed RHS matrix
+size_t kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(
+    size_t k,    //
+    size_t nr,   //
+    size_t kr,   //
+    size_t bl);  //
+
 /// Gets the size in bytes for the quantized and packed RHS matrix.
 ///
 /// @param[in] n  The number of rows in the RHS matrix (not packed)
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c
index 4a471441..670dda9d 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_rhs_pack_nxk_x16p2vlx2b_x16_x16_sme.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c b/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c
index 67119418..d22136f0 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -10,6 +10,8 @@
 
 #define MAX_NR 12
 
+#include "kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.h"
+
 #include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -21,9 +23,8 @@ static const size_t kai_nr = 12;
 static const size_t kai_kr = 4;
 static const size_t kai_sr = 1;
 
-size_t kai_get_n_step_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t nr) {
-    KAI_ASSUME(kai_nr == nr);
-    return nr;
+size_t kai_get_n_step_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(void) {
+    return kai_nr;
 }
 
 size_t kai_get_rhs_offset_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(size_t n_idx) {
@@ -66,7 +67,7 @@ void kai_run_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(
 
     size_t height = k;
     const size_t width = n;
-    const void* in = (void*)rhs;
+    const void* in = (const void*)rhs;
     void* out = rhs_packed;
     const size_t in_stride = rhs_stride;
     const float* pad_row = rhs;
@@ -80,7 +81,7 @@ void kai_run_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon(
         bias_step = 0;
     }
 
-    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (void*)bias;
+    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (const void*)bias;
 
     const size_t out_stride = nr * kai_roundup(height, kr) * sizeof(uint16_t) + nr * sizeof(uint32_t);
 
-- 
GitLab


From 7a68971b28598d0333e76427d3b0ada17cb26699 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Mon, 20 Jan 2025 14:38:42 +0000
Subject: [PATCH 2/4] Fix the header file inclusion in other kernels

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 ...tmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c | 2 ++
 ...mp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c | 8 ++++----
 ..._clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c | 8 ++++----
 ...ul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c | 2 ++
 .../matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c         | 2 +-
 .../pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h  | 7 +++++++
 6 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c
index 7e1a3ade..0d6e68d2 100644
--- a/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c
+++ b/kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.c
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
+
 #include <arm_neon.h>
 #include <stddef.h>
 #include <stdint.h>
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
index b1911981..b76754f2 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
@@ -158,10 +158,10 @@ void kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa(
     const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa();
     const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa();
 
-    const uint16_t* lhs_scales =
-        (uint16_t*)((const int8_t*)lhs_packed + lhs_packed_stride - (mr * num_blocks) * kai_num_bytes_multiplier_lhs);
-    const uint16_t* rhs_scales =
-        (uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - (nr * num_blocks) * kai_num_bytes_multiplier_rhs);
+    const uint16_t* lhs_scales = (const uint16_t*)((const uint8_t*)lhs_packed + lhs_packed_stride -
+                                                   (mr * num_blocks) * kai_num_bytes_multiplier_lhs);
+    const uint16_t* rhs_scales = (const uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride -
+                                                   (nr * num_blocks) * kai_num_bytes_multiplier_rhs);
 
     __asm__ volatile(
         // Switch to streaming mode with ZA enabling
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
index 002b55fa..9c27a588 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
@@ -161,10 +161,10 @@ void kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot(
     const size_t mr = kai_get_mr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot();
     const size_t nr = kai_get_nr_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot();
 
-    const uint16_t* lhs_scales =
-        (uint16_t*)((const int8_t*)lhs_packed + lhs_packed_stride - (mr * num_blocks) * kai_num_bytes_multiplier_lhs);
-    const uint16_t* rhs_scales =
-        (uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride - (nr * num_blocks) * kai_num_bytes_multiplier_rhs);
+    const uint16_t* lhs_scales = (const uint16_t*)((const uint8_t*)lhs_packed + lhs_packed_stride -
+                                                   (mr * num_blocks) * kai_num_bytes_multiplier_lhs);
+    const uint16_t* rhs_scales = (const uint16_t*)((const uint8_t*)rhs_packed + rhs_packed_stride -
+                                                   (nr * num_blocks) * kai_num_bytes_multiplier_rhs);
 
     __asm__ volatile(
         // Switch to streaming mode with ZA enabling
diff --git a/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c b/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
index 4d6e4f00..902b11e7 100644
--- a/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+++ b/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
@@ -8,6 +8,8 @@
 #error This file must be compiled for AArch64, FEAT_SVE2.
 #else  // Architectural features check.
 
+#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
+
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
index cc97c7b4..272942df 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+++ b/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
@@ -83,7 +83,7 @@ void kai_run_lhs_pack_bf16p2vlx2_f32_sme(
         void* out = (void*)((char*)lhs_packed + block_y * kai_roundup(k, kai_kr) * sizeof(uint16_t));
 
         for (size_t y = 0; y < height; y++) {
-            in[y] = (void*)((char*)lhs + (block_y + y) * lhs_stride);
+            in[y] = (const void*)((const char*)lhs + (block_y + y) * lhs_stride);
         }
 
         __asm__ __volatile__(
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h
index 8bd3f6ee..bfd5c393 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme.h
@@ -42,6 +42,13 @@ size_t kai_get_bias_offset_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t n_id
 /// @return The offset in bytes to the data element.
 size_t kai_get_rhs_packed_offset_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t n_idx, size_t k);
 
+/// Get the row stride in bytes to the packed RHS matrix
+///
+/// @param[in] k        The number of columns in the RHS matrix (not packed).
+///
+/// @return the stride in bytes to the packed RHS matrix
+size_t kai_get_rhs_packed_stride_rhs_pack_nxk_f32p2vlx1biasf32_f32_f32_sme(size_t k);
+
 /// Gets the size in bytes of the packed RHS buffer.
 ///
 /// @param[in] n Number of rows.
-- 
GitLab


From 82ccf562cb200936acba8dd9b0a772746a9347a4 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Mon, 20 Jan 2025 17:01:04 +0000
Subject: [PATCH 3/4] Fix compilation issue

- Missing ; for the end of function declaration in the header file

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 ...i_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h | 72 +++++++++----------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
index bbaa44fc..27d03346 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.h
@@ -46,43 +46,43 @@ size_t kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n
 /// @param[in] k Number of columns.
 ///
 /// @return The row stride in bytes to the packed RHS matrix.
-size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k) {
-    /// Gets the size in bytes of the packed RHS buffer.
-    ///
-    /// @param[in] n Number of rows.
-    /// @param[in] k Number of columns.
-    ///
-    /// @return The size in bytes of the packed RHS buffer.
-    size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k);
+size_t kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t k);
+/// Gets the size in bytes of the packed RHS buffer.
+///
+/// @param[in] n Number of rows.
+/// @param[in] k Number of columns.
+///
+/// @return The size in bytes of the packed RHS buffer.
+size_t kai_get_rhs_packed_size_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(size_t n, size_t k);
 
-    /// Runs the RHS packing function for matrix multiplication.
-    ///
-    /// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset
-    /// calculated using the following functions:
-    ///
-    ///   * RHS: @ref
-    ///   kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-    ///   * Bias: @ref
-    ///   kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-    ///   * Output: @ref
-    ///   kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
-    ///
-    /// @param[in] num_groups Number of groups. It must be 1.
-    /// @param[in] n Number of columns of the output matrix.
-    /// @param[in] k Common dimension between the LHS and RHS matrix.
-    /// @param[in] nr Block size in N dimension. It must be 12.
-    /// @param[in] kr Block size in K dimension. It must be 4.
-    /// @param[in] sr Number of kr splits. It must be 1.
-    /// @param[in] rhs_stride Row stride in bytes of the RHS matrix.
-    /// @param[in] rhs RHS matrix data buffer.
-    /// @param[in] bias Bias matrix data buffer.
-    /// @param[in] scale Scale data buffer. It must be NULL.
-    /// @param[out] rhs_packed Packed RHS matrix.
-    /// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0.
-    /// @param[in] params Extra packing parameters. It must be NULL.
-    void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
-        size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
-        const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params);
+/// Runs the RHS packing function for matrix multiplication.
+///
+/// The pointer of each buffers (RHS, bias and packed RHS) needs to be added with offset
+/// calculated using the following functions:
+///
+///   * RHS: @ref
+///   kai_get_rhs_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+///   * Bias: @ref
+///   kai_get_bias_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+///   * Output: @ref
+///   kai_get_rhs_packed_offset_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.
+///
+/// @param[in] num_groups Number of groups. It must be 1.
+/// @param[in] n Number of columns of the output matrix.
+/// @param[in] k Common dimension between the LHS and RHS matrix.
+/// @param[in] nr Block size in N dimension. It must be 12.
+/// @param[in] kr Block size in K dimension. It must be 4.
+/// @param[in] sr Number of kr splits. It must be 1.
+/// @param[in] rhs_stride Row stride in bytes of the RHS matrix.
+/// @param[in] rhs RHS matrix data buffer.
+/// @param[in] bias Bias matrix data buffer.
+/// @param[in] scale Scale data buffer. It must be NULL.
+/// @param[out] rhs_packed Packed RHS matrix.
+/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0.
+/// @param[in] params Extra packing parameters. It must be NULL.
+void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
+    size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
+    const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params);
 
 #ifdef __cplusplus
 }  // extern "C"
-- 
GitLab


From 764fd8949dc407e8a438ea855d911bb239fcc646 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Mon, 20 Jan 2025 17:07:44 +0000
Subject: [PATCH 4/4] Fix explicit cast in Matmul BF16P12x4BIASF16_F16

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 .../matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c
index cff2dd0f..fae15ccb 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c
@@ -69,7 +69,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
     const void* in = rhs;
     void* out = rhs_packed;
     const size_t in_stride = rhs_stride;
-    const uint16_t* pad_row = (uint16_t*)rhs;
+    const uint16_t* pad_row = (const uint16_t*)rhs;
 
     // Fill zeros if bias is nullptr
     size_t bias_step = nr * sizeof(uint16_t);
@@ -80,7 +80,7 @@ void kai_run_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(
         bias_step = 0;
     }
 
-    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (void*)bias;
+    const void* bias_ptr = bias == NULL ? (void*)zero_bias : (const void*)bias;
 
     size_t out_stride = kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p12x4biasf16_f16_neon(height);
 
-- 
GitLab