diff --git a/CHANGELOG.md b/CHANGELOG.md index 965ff197dcad3abe47a027b8b751426181e193b1..006b7bbdb04105367359169fbcfabb2ef1482969 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - New 4x8 block size variant of matrix multiplication of QAI8DXP LHS and QSI4C32P RHS with F32 output. - Optimizations for FEAT_DotProd. - Added demonstration of integration using CMake in F16 Arm® Neon™ matrix multiplication example. +- Fixes: + - Fix the RHS packing micro-kernel kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon to handle null bias. ## v1.3.0 diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.c index a594f9d1190f66285fcd2da435655eab3f26de09..d0c66276369fbe141c8bd5c185563c3fb16afe89 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -97,18 +97,28 @@ void kai_run_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon( // Copy the scaling factors and bias size_t rows_left = n - row_idx; + // Saving scales. if (rows_left >= nr) { memcpy(scaling_factors, &scale[row_idx], nr * kai_num_bytes_multiplier_rhs); - memcpy(biases, &bias[row_idx], nr * kai_num_bytes_bias); } else { // Fill remaining values memcpy(scaling_factors, &scale[row_idx], rows_left * kai_num_bytes_multiplier_rhs); - memcpy(biases, &bias[row_idx], rows_left * kai_num_bytes_bias); // Set leftover to 0 memset(&scaling_factors[rows_left], 0, (nr - rows_left) * kai_num_bytes_multiplier_rhs); - memset(&biases[rows_left], 0, (nr - rows_left) * kai_num_bytes_bias); } - + if (bias == NULL) { + // Set bias to 0 + memset(biases, 0, nr * kai_num_bytes_bias); + } else { + if (rows_left >= nr) { + memcpy(biases, &bias[row_idx], nr * kai_num_bytes_bias); + } else { + // Fill remaining values + memcpy(biases, &bias[row_idx], rows_left * kai_num_bytes_bias); + // Set leftover to 0 + memset(&biases[rows_left], 0, (nr - rows_left) * kai_num_bytes_bias); + } + } // Iterate over rows in the nr row block for (size_t nr_block_idx = 0; nr_block_idx < nr; ++nr_block_idx) { const uint8_t* const src_row = rhs + ((row_idx + nr_block_idx) * rhs_stride); diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h index 07e953d1f5d8b60373386f319017fc401a3c658d..6e94913af2e3eeffe998e89ef4e696bd9887b9a7 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -89,7 +89,7 @@ size_t kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon( /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// @param[in] rhs The RHS matrix containing the 4-bit values. /// Size in bytes is expected to be greater than or equal to n * k * (sizeof(uint8_t) / 2). -/// @param[in] bias The biases. +/// @param[in] bias The biases. The bias is set to 0.f if this argument is NULL. /// @param[in] scale The scale for each output channel. /// @param[out] rhs_packed The packed RHS matrix. /// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix.