diff --git a/CHANGELOG.md b/CHANGELOG.md index 1247ef29981e6b313615168eca23bec690e20b09..897a124a46a327edf676306bc112246a2abeb007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo ## Upcoming Release +- Breaking changes: + - Rename kai_lhs_quant_pack_qsi8d32p_f32_neon to kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon + - Rename kai_lhs_quant_pack_qsi8d32p_f32 to kai_lhs_quant_pack_qsi8d32p_f32_neon + ## v1.6.0 - Add CMake installation and `find_package()` support. diff --git a/CMakeLists.txt b/CMakeLists.txt index 53ce829f60748d381793219e160eaae6fcf4af00..6c4bb5f6b22d541553f62f08341052719a531b31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,38 +87,38 @@ endif() set(KLEIDIAI_FILES_SCALAR kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c - kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c -) + ) -set(KLEIDIAI_FILES_NEON_FP16 + set(KLEIDIAI_FILES_NEON_FP16 kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c -) + ) -set(KLEIDIAI_FILES_NEON_BF16 + set(KLEIDIAI_FILES_NEON_BF16 kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p1x4_bf16p12x4b_1x36_neon_dot.c kai/ukernels/matmul/matmul_clamp_f32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p8x4_bf16p12x4b_8x12_neon_mmla.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p1x4_f32_neon.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_bf16p8x4_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_quant_pack_kxn_bf16p12x4biasf32_f32_neon.c -) + ) -set(KLEIDIAI_FILES_NEON_FP16_BF16 + set(KLEIDIAI_FILES_NEON_FP16_BF16 kai/ukernels/matmul/matmul_clamp_f16_bf16p_bf16p/kai_matmul_clamp_f16_bf16p8x4_bf16p12x4b_8x12_neon_mmla.c kai/ukernels/matmul/pack/kai_lhs_pack_bf16p8x4_f16_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf16_f16_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p12x4biasf32_f16_neon.c -) + ) set(KLEIDIAI_FILES_NEON kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c + kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt index 8159b1c363297c06c71c3e1d1b93e88300ffe9b8..528bd5500bd31f8c5673f53ecbd8edeb49be174c 100644 --- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/CMakeLists.txt @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -24,7 +24,7 @@ 
include_directories( add_executable(matmul_clamp_f32_qsi8d32p_qsi4c32p matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c - ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c + ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32_neon.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c) diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp index 6d992b625e4387f7d5e7230261764ce11119e545..445ec50e5ae077970b46fd714faa9191ead32eed 100644 --- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp +++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -15,7 +15,7 @@ #include // Include micro-kernel variants -#include "kai_lhs_quant_pack_qsi8d32p_f32.h" +#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h" #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h" @@ -349,7 +349,7 @@ int main(int argc, char** argv) { const size_t sr = ukernel_variants[idx_variant].ukernel.get_sr(); // Get the size in bytes for the packed matrices - const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); + const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon(m, k, bl, mr, kr, sr); const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); const size_t dst_size = ukernel_variants[idx_variant].ukernel.get_dst_size(m, n); @@ -378,7 +378,7 @@ int main(int argc, char** argv) { const auto time_s = std::chrono::high_resolution_clock::now(); // LHS packing - kai_run_lhs_quant_pack_qsi8d32p_f32( + kai_run_lhs_quant_pack_qsi8d32p_f32_neon( m, k, bl, // Dimensions mr, kr, sr, 0, // Packing arguments (const float*)lhs_native_mtx_f32, // LHS diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 3796d88d150d8ee230cdee2421b192837c51f31f..93f4aed5a8b2a4b44c675c0a8fe6f9703dda04a7 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -21,7 +21,6 @@ package(default_visibility = ["//visibility:private"]) # buildifier: keep sorted SCALAR_KERNELS = [ "pack/kai_lhs_quant_pack_qai8dxp_f32", - "pack/kai_lhs_quant_pack_qsi8d32p_f32", "pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0", "pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0", ] @@ -29,6 +28,7 @@ SCALAR_KERNELS = [ # buildifier: keep sorted NEON_KERNELS = [ "pack/kai_lhs_quant_pack_qsi8d32p_f32_neon", + "pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon", "pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon", "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0", "pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon", diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h 
b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h index 5c7e6e05540bc32aff66fc84f44438a31b5575db..4ecff7cb195b8c30be4c1aee086c26d16a3d4fd1 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h index 990fa145505fb50f784d6e798bb048685031a28a..ec0da8f1d8caddc2a9cce9fb1ac04d037934ca20 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h index 44c51a728167f356ec9baf61a02bac7ea9b520c2..85b50256bfc18c90b73a917927c90d0fe8a57db3 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. 
/// -# @ref kai_rhs_pack_nxk_qsi4c32pscalef16qsu4c32s16s0 to pack the RHS NxK matrix. /// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h index e7f4d70d5864c747a0bc65b7d6e36a8480e12990..48b09c324661ce3f42bede1a502078423a362b0a 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h @@ -1,6 +1,5 @@ - // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +13,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -116,7 +115,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dot /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. /// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h index 481d69abc0e4ea4585dccda367688a7cce272b04..358360c95573fa7e53f7434c593cc25f9a8e9005 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix in a single step. +/// -# @ref kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix in a single step. /// -# @ref kai_rhs_pack_nxk_qsi4c32pscalef16qsu4c32s16s0 to pack the RHS NxK matrix. 
/// -------------------------------------------------- diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h index 593e4600cf4eaf920f1eee2c5d02ebbf676a3d42..093006733c270b1ec1fcd11429c7217914361e47 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -117,7 +117,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm( /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. /// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h index 53c5c933c8520e79b27662708fcfc2148f7ebac7..b5b1cc7e5a92a6907a7e4d9ded06a7e665502afc 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h @@ -1,6 +1,6 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -14,7 +14,7 @@ extern "C" { /// Micro-kernel dependencies /// -/// -# kai_lhs_quant_pack_qsi8d32p_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_lhs_quant_pack_qsi8d32p_f32_neon to dynamically quantize and pack the LHS matrix /// -# kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 to pack the RHS matrix /// -------------------------------------------------- @@ -117,7 +117,7 @@ size_t kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8m /// @param[in] bl Block length. It must be 32. /// @param[in] lhs_packed The LHS packed matrix. 
/// When the activation are dynamically quantized, you can obtain this matrix -/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32 micro-kernel which performs +/// by calling the @ref kai_lhs_quant_pack_qsi8d32p_f32_neon micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref /// kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0 diff --git a/kai/ukernels/matmul/pack/README.md b/kai/ukernels/matmul/pack/README.md index 950a69ac41ae0809c1a16e6b4fd1528b0f9f6da1..da5ea7488c7087a5b3d5ca64a94a7d8b5603dcca 100644 --- a/kai/ukernels/matmul/pack/README.md +++ b/kai/ukernels/matmul/pack/README.md @@ -1,5 +1,5 @@ @@ -48,7 +48,7 @@ For optimal cache utilization, the operands are packed for the matmul operations These packing routines are used with LHS operand of the matmul. It quantizes the input to int8 and packs them along with their scale (and offset values in asymmetric quantization) in the destination matrix. -#### kai_run_lhs_quant_pack_qsi8d32p_f32() +#### kai_run_lhs_quant_pack_qsi8d32p_f32_neon() Quantize and pack LHS matrix with per-block quantization parameters. diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c deleted file mode 100644 index a75daf27eb3e9561ddb0a149f07857ae41fcad5b..0000000000000000000000000000000000000000 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c +++ /dev/null @@ -1,124 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// -#include "kai_lhs_quant_pack_qsi8d32p_f32.h" - -#include -#include - -#include "kai/kai_common.h" - -static const size_t kai_num_bytes_multiplier = sizeof(uint16_t); - -inline static size_t kai_num_bytes_per_block(size_t bl) { - return bl * sizeof(int8_t) + kai_num_bytes_multiplier; -} - -inline static size_t kai_num_blocks_per_row(size_t k, size_t bl) { - KAI_ASSERT((k % bl) == 0); - return k / bl; -} - -inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_t bl) { - KAI_UNUSED(kr); - return mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block(bl); -} - -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr) { - KAI_UNUSED(mr); - return 1; -} - -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_stride) { - return m_idx * lhs_stride; -} - -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( - size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { - KAI_ASSUME((k % 2) == 0); - KAI_ASSUME((k % kr) == 0); - KAI_ASSUME((k % bl) == 0); - - KAI_UNUSED(sr); - KAI_UNUSED(kr); - - return (m_idx / mr) * kai_lhs_packed_stride(k, mr, kr, bl); -} - -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( - size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { - KAI_ASSUME((k % 2) == 0); - KAI_ASSUME((k % kr) == 0); - KAI_ASSUME((k % bl) == 0); - - KAI_UNUSED(sr); - KAI_UNUSED(kr); - - const size_t num_rows = kai_roundup(m, mr) / mr; - - return num_rows * kai_lhs_packed_stride(k, mr, kr, bl); -} - -void kai_run_lhs_quant_pack_qsi8d32p_f32( - size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, - size_t lhs_stride, void* lhs_packed) { - if (m == 0) { - return; - } - - const size_t num_rows = m; - const size_t k_block_len = kr / sr; - const size_t lhs_packed_stride = 
kai_lhs_packed_stride(k, mr, kr, bl); - const size_t num_blocks_per_row = kai_num_blocks_per_row(k, bl); - const size_t num_bytes_per_block = kai_num_bytes_per_block(bl); - - for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { - const float* src_ptr = (const float*)((const uint8_t*)lhs + (row_idx + m_idx_start) * lhs_stride); - - for (size_t b = 0; b < num_blocks_per_row; ++b) { - float abs_max = 0.0F; - - const size_t dst_x = ((row_idx + m_idx_start) % mr); - int8_t* dst_ptr = (int8_t*)lhs_packed + (b * mr) * num_bytes_per_block; - - for (size_t idx_v = 0; idx_v < bl; ++idx_v) { - const float val = src_ptr[idx_v]; - abs_max = KAI_MAX(abs_max, fabsf(val)); - } - - // Calculate scale and reciprocal - const float scale = abs_max / ((1 << 7) - 1); - const float rep_scale = scale ? 1.0F / scale : 0.0F; - - *((uint16_t*)(dst_ptr + dst_x * kai_num_bytes_multiplier)) = kai_cast_f16_f32(scale); - dst_ptr += mr * kai_num_bytes_multiplier; - - dst_ptr += dst_x * k_block_len * sizeof(int8_t); - - // Quantize and pack the block - for (size_t k_idx = 0; k_idx < bl; k_idx += k_block_len) { - for (size_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) { - // Clamp at the last valid k-index - const size_t k_idx_start = KAI_MIN(k_idx + k_block_idx, k - 1); - - const float src0_0 = *(src_ptr + k_idx_start); - - // Scale the values - int32_t v0_s32 = (int32_t)(roundf(src0_0 * rep_scale)); - - *dst_ptr = (int8_t)v0_s32; - dst_ptr += sizeof(int8_t); - } - dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t); - } - - src_ptr += bl; - } - // Move to the next row if we have interleaved all Mr rows - if ((((row_idx + 1) + m_idx_start) % mr) == 0) { - lhs_packed = (void*)((int8_t*)lhs_packed + lhs_packed_stride); - } - } -} diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c index 70298754445c783e55af8d34724f5d8cfc013ca4..07c6355fea9b21210a84bc5d4dbb19d8b197d632 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -28,7 +28,6 @@ inline static size_t kai_num_blocks_per_row(size_t k, size_t bl) { inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_t bl) { KAI_UNUSED(kr); - return mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block(bl); } @@ -46,12 +45,10 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon( KAI_ASSUME((k % 2) == 0); KAI_ASSUME((k % kr) == 0); KAI_ASSUME((k % bl) == 0); - KAI_ASSUME((m_idx % mr) == 0); KAI_UNUSED(sr); KAI_UNUSED(kr); - // The scales are stored after all the mr packed quantized values return (m_idx / mr) * kai_lhs_packed_stride(k, mr, kr, bl); } @@ -66,66 +63,67 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon( const size_t num_rows = kai_roundup(m, mr) / mr; - return (num_rows * kai_lhs_packed_stride(k, mr, kr, bl)); + return num_rows * kai_lhs_packed_stride(k, mr, kr, bl); } void kai_run_lhs_quant_pack_qsi8d32p_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed) { - KAI_ASSUME((bl % kr) == 0); - KAI_ASSUME((k % bl) == 0); - KAI_ASSUME(kr == 4); - KAI_ASSUME(bl == 32); - KAI_UNUSED(sr); - 
KAI_UNUSED(m_idx_start); - KAI_UNUSED(lhs_stride); - if (m == 0) { return; } - const size_t num_blocks = kai_num_blocks_per_row(k, bl); + const size_t num_rows = m; + const size_t k_block_len = kr / sr; const size_t lhs_packed_stride = kai_lhs_packed_stride(k, mr, kr, bl); + const size_t num_blocks_per_row = kai_num_blocks_per_row(k, bl); + const size_t num_bytes_per_block = kai_num_bytes_per_block(bl); - const float* lhs_ptr = lhs; - int8_t* lhs_packed_start_ptr = lhs_packed; - - for (size_t m_idx = 0; m_idx < m; m_idx++) { - int8_t* lhs_packed_ptr = lhs_packed_start_ptr; - uint16_t* lhs_packed_scales = - (uint16_t*)(lhs_packed_ptr + lhs_packed_stride - ((mr * num_blocks) * kai_num_bytes_multiplier)); + for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { + const float* src_ptr = (const float*)((const uint8_t*)lhs + (row_idx + m_idx_start) * lhs_stride); - lhs_packed_ptr += (m_idx % mr) * kr; - lhs_packed_scales += (m_idx % mr); + for (size_t b = 0; b < num_blocks_per_row; ++b) { + float abs_max = 0.0F; - for (size_t block_idx = 0; block_idx < num_blocks; block_idx++) { - // Maximum absolute value of the block elements - float amax = 0.0F; + const size_t dst_x = ((row_idx + m_idx_start) % mr); + int8_t* dst_ptr = (int8_t*)lhs_packed + (b * mr) * num_bytes_per_block; - for (size_t bl_idx = 0; bl_idx < bl; bl_idx++) { - amax = KAI_MAX(amax, fabsf(lhs_ptr[bl_idx])); + for (size_t idx_v = 0; idx_v < bl; ++idx_v) { + const float val = src_ptr[idx_v]; + abs_max = KAI_MAX(abs_max, fabsf(val)); } - const float sf = amax / ((1 << 7) - 1); + // Calculate scale and reciprocal + const float scale = abs_max / ((1 << 7) - 1); + const float rep_scale = scale ? 1.0F / scale : 0.0F; - const float sf_inv = sf ? 1.0F / sf : 0.0F; + *((uint16_t*)(dst_ptr + dst_x * kai_num_bytes_multiplier)) = kai_cast_f16_f32(scale); + dst_ptr += mr * kai_num_bytes_multiplier; - for (size_t bl_idx = 0; bl_idx < bl; bl_idx += kr) { - for (size_t kr_idx = 0; kr_idx < kr; ++kr_idx) { - int32_t v0_s32 = (int32_t)(roundf(lhs_ptr[kr_idx] * sf_inv)); - lhs_packed_ptr[kr_idx] = (int8_t)v0_s32; + dst_ptr += dst_x * k_block_len * sizeof(int8_t); + + // Quantize and pack the block + for (size_t k_idx = 0; k_idx < bl; k_idx += k_block_len) { + for (size_t k_block_idx = 0; k_block_idx < k_block_len; ++k_block_idx) { + // Clamp at the last valid k-index + const size_t k_idx_start = KAI_MIN(k_idx + k_block_idx, k - 1); + + const float src0_0 = *(src_ptr + k_idx_start); + + // Scale the values + int32_t v0_s32 = (int32_t)(roundf(src0_0 * rep_scale)); + + *dst_ptr = (int8_t)v0_s32; + dst_ptr += sizeof(int8_t); } - lhs_ptr += kr; - lhs_packed_ptr += mr * kr; + dst_ptr += (mr - 1) * k_block_len * sizeof(int8_t); } - // Num_blocks (rows) x Mr (cols) - lhs_packed_scales[0] = kai_cast_f16_f32(sf); - - lhs_packed_scales += mr; + src_ptr += bl; } - if (((m_idx + 1) % mr) == 0) { - lhs_packed_start_ptr += lhs_packed_stride; + // Move to the next row if we have interleaved all Mr rows + if ((((row_idx + 1) + m_idx_start) % mr) == 0) { + lhs_packed = (void*)((int8_t*)lhs_packed + lhs_packed_stride); } } } diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h index 837d95bde5638a20ec1cec2209ffdb28e7aaa0b2..d5dac6d18ea893e68a61415b2fbb83b887fd95fb 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h @@ -1,11 +1,12 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm 
Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // #pragma once #include +#include #ifdef __cplusplus extern "C" { @@ -35,7 +36,7 @@ size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon(size_t m_idx, size_t /// /// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. /// -/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of mr. +/// @param[in] m_idx Row index in the LHS matrix (not packed). /// @param[in] k Total number of columns in the LHS matrix (not packed). /// @param[in] bl The block length. /// @param[in] mr The number of M rows to interleave on the same output row. diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c new file mode 100644 index 0000000000000000000000000000000000000000..1bba9838a1c4c88e347bcb92c12b6676587d5e41 --- /dev/null +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.c @@ -0,0 +1,132 @@ +// +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#if !defined(__aarch64__) +#error This file must be compiled for AArch64. +#else // Architectural features check. + +#include "kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h" + +#include +#include + +#include "kai/kai_common.h" + +static const size_t kai_num_bytes_multiplier = sizeof(uint16_t); + +inline static size_t kai_num_bytes_per_block(size_t bl) { + return bl * sizeof(int8_t) + kai_num_bytes_multiplier; +} + +inline static size_t kai_num_blocks_per_row(size_t k, size_t bl) { + KAI_ASSERT((k % bl) == 0); + return k / bl; +} + +inline static size_t kai_lhs_packed_stride(size_t k, size_t mr, size_t kr, size_t bl) { + KAI_UNUSED(kr); + + return mr * kai_num_blocks_per_row(k, bl) * kai_num_bytes_per_block(bl); +} + +size_t kai_get_m_step_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t mr) { + KAI_UNUSED(mr); + return 1; +} + +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t m_idx, size_t lhs_stride) { + return m_idx * lhs_stride; +} + +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon( + size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { + KAI_ASSUME((k % 2) == 0); + KAI_ASSUME((k % kr) == 0); + KAI_ASSUME((k % bl) == 0); + KAI_ASSUME((m_idx % mr) == 0); + + KAI_UNUSED(sr); + KAI_UNUSED(kr); + + // The scales are stored after all the mr packed quantized values + return (m_idx / mr) * kai_lhs_packed_stride(k, mr, kr, bl); +} + +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pmrx4_f32_neon( + size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { + KAI_ASSUME((k % 2) == 0); + KAI_ASSUME((k % kr) == 0); + KAI_ASSUME((k % bl) == 0); + + KAI_UNUSED(sr); + KAI_UNUSED(kr); + + const size_t num_rows = kai_roundup(m, mr) / mr; + + return (num_rows * kai_lhs_packed_stride(k, mr, kr, bl)); +} + +void kai_run_lhs_quant_pack_qsi8d32pmrx4_f32_neon( + size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, + size_t lhs_stride, void* lhs_packed) { + KAI_ASSUME((bl % kr) == 0); + KAI_ASSUME((k % bl) == 0); + KAI_ASSUME(kr == 4); + KAI_ASSUME(bl == 32); + KAI_UNUSED(sr); + KAI_UNUSED(m_idx_start); + KAI_UNUSED(lhs_stride); + + if (m == 0) { + return; + } + + const size_t num_blocks = kai_num_blocks_per_row(k, bl); + const size_t 
lhs_packed_stride = kai_lhs_packed_stride(k, mr, kr, bl); + + const float* lhs_ptr = lhs; + int8_t* lhs_packed_start_ptr = lhs_packed; + + for (size_t m_idx = 0; m_idx < m; m_idx++) { + int8_t* lhs_packed_ptr = lhs_packed_start_ptr; + uint16_t* lhs_packed_scales = + (uint16_t*)(lhs_packed_ptr + lhs_packed_stride - ((mr * num_blocks) * kai_num_bytes_multiplier)); + + lhs_packed_ptr += (m_idx % mr) * kr; + lhs_packed_scales += (m_idx % mr); + + for (size_t block_idx = 0; block_idx < num_blocks; block_idx++) { + // Maximum absolute value of the block elements + float amax = 0.0F; + + for (size_t bl_idx = 0; bl_idx < bl; bl_idx++) { + amax = KAI_MAX(amax, fabsf(lhs_ptr[bl_idx])); + } + + const float sf = amax / ((1 << 7) - 1); + + const float sf_inv = sf ? 1.0F / sf : 0.0F; + + for (size_t bl_idx = 0; bl_idx < bl; bl_idx += kr) { + for (size_t kr_idx = 0; kr_idx < kr; ++kr_idx) { + int32_t v0_s32 = (int32_t)(roundf(lhs_ptr[kr_idx] * sf_inv)); + lhs_packed_ptr[kr_idx] = (int8_t)v0_s32; + } + lhs_ptr += kr; + lhs_packed_ptr += mr * kr; + } + + // Num_blocks (rows) x Mr (cols) + lhs_packed_scales[0] = kai_cast_f16_f32(sf); + + lhs_packed_scales += mr; + } + if (((m_idx + 1) % mr) == 0) { + lhs_packed_start_ptr += lhs_packed_stride; + } + } +} +#endif // Architectural features check. diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h similarity index 85% rename from kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h rename to kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h index 2baef93f90549e8bd07f4af0f087fb95df8403da..3e28ec9958ba1d877f5af2b321c4f28a14fd1c6c 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h @@ -1,12 +1,11 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // #pragma once #include -#include #ifdef __cplusplus extern "C" { @@ -19,7 +18,7 @@ extern "C" { /// @param[in] mr The number of M rows to interleave on the same output row. /// /// @return the m step value -size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr); +size_t kai_get_m_step_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t mr); /// Gets the offset in bytes for the LHS matrix (not packed) /// @@ -29,14 +28,14 @@ size_t kai_get_m_step_lhs_quant_pack_qsi8d32p_f32(size_t mr); /// @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed) /// /// @return the offset in bytes to the LHS matrix -size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_stride); +size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon(size_t m_idx, size_t lhs_stride); /// Gets the offset in bytes for the packed LHS matrix, /// which contains the packed 8-bit quantized symmetric per-block (qsi8d32) values. /// /// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. /// -/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of mr. /// @param[in] k Total number of columns in the LHS matrix (not packed). /// @param[in] bl The block length. /// @param[in] mr The number of M rows to interleave on the same output row. 
@@ -44,7 +43,7 @@ size_t kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32(size_t m_idx, size_t lhs_s /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the offset in bytes to the packed LHS matrix -size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Gets the size in bytes for the quantized and packed LHS matrix @@ -58,7 +57,7 @@ size_t kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32( /// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. /// /// @return the packed LHS matrix size in bytes -size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( +size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr); /// Run the micro-kernel to quantize and pack the LHS matrix. @@ -75,7 +74,7 @@ size_t kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32( /// @param[in] lhs LHS matrix. /// @param[in] lhs_stride Stride in bytes between two rows of LHS. /// @param[out] lhs_packed The quantized and packed LHS matrix. -void kai_run_lhs_quant_pack_qsi8d32p_f32( +void kai_run_lhs_quant_pack_qsi8d32pmrx4_f32_neon( size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index 153decb53e1c839e68eafbadf67c60dcf9c387d5..9da7e571aae149b65757912eba2f9d8a561e81d6 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -24,8 +24,8 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h" -#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h" +#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pmrx4_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" #include "test/common/cpu_info.hpp" @@ -44,14 +44,14 @@ namespace kai::test { // Interface for the LHS and RHS packed size and packing functions -using kai_get_lhs_packed_size_func_t = decltype(&kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_packed_size_func_t = decltype(&kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_packed_size_func_t = decltype(&kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_get_lhs_packed_offset_func_t = decltype(&kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_packed_offset_func_t = decltype(&kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_packed_offset_func_t = decltype(&kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_get_lhs_offset_func_t = decltype(&kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32); +using kai_get_lhs_offset_func_t = 
decltype(&kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon); using kai_get_rhs_offset_func_t = decltype(&kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); -using kai_run_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32); +using kai_run_lhs_pack_func_t = decltype(&kai_run_lhs_quant_pack_qsi8d32p_f32_neon); using kai_run_rhs_pack_func_t = decltype(&kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0); // Micro-kernel interface @@ -71,25 +71,25 @@ static const std::array< 7> variants_kai_matmul_clamp_f32_qsi8d32p_qsi4c32p = { {UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm, cpu_has_i8mm, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32, + clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod, cpu_has_dotprod, lhs_quant_pack_qsi8d32p_f32_neon, rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa, cpu_has_sme2, lhs_quant_pack_qsi8d32p_f32_neon, - rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon), + clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa, cpu_has_sme2, + lhs_quant_pack_qsi8d32pmrx4_f32_neon, rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon), UKERNEL_MATMUL_PACK_VARIANT( - clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, cpu_has_sme2, lhs_quant_pack_qsi8d32p_f32_neon, + clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot, cpu_has_sme2, lhs_quant_pack_qsi8d32pmrx4_f32_neon, rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon)}}; using MatMulTestParams_withPortion = std::tuple;
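As a migration aid for the renames recorded in the CHANGELOG hunk above: existing callers only need to switch symbol and header names, since the argument lists are unchanged. The sketch below mirrors the updated example program in this patch; the wrapper function, the malloc-based allocation, and the parameter names are illustrative assumptions, and m, k, bl, mr, kr and sr are expected to come from the selected matmul micro-kernel's getter functions.

```c
#include <stdlib.h>

// Formerly "kai_lhs_quant_pack_qsi8d32p_f32.h"
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"

// Illustrative helper: quantize and pack the LHS matrix with the renamed micro-kernel.
// The caller owns the returned buffer and frees it after running the matmul micro-kernel.
static void* pack_lhs_qsi8d32p(
    size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, const float* lhs, size_t lhs_stride) {
    // Old name: kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(...)
    const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon(m, k, bl, mr, kr, sr);
    void* lhs_packed = malloc(lhs_packed_size);

    // Old name: kai_run_lhs_quant_pack_qsi8d32p_f32(...); same argument order as before.
    kai_run_lhs_quant_pack_qsi8d32p_f32_neon(
        m, k, bl,       // Dimensions
        mr, kr, sr, 0,  // Packing arguments (m_idx_start == 0)
        lhs,            // LHS matrix (f32)
        lhs_stride,     // Stride in bytes between two LHS rows
        lhs_packed);    // Quantized and packed LHS output

    return lhs_packed;
}
```

The SME2 matmul variants, whose dependency comments are updated in this patch, instead pair with kai_run_lhs_quant_pack_qsi8d32pmrx4_f32_neon and its *_get_* companions, which keep the same prototypes as the functions they were renamed from.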