diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
index 4ce0e7b4d7392f3f4efb49a78f20b653f6efe908..16b95b7c9f304fb4b6367566c98852238a5f0b62 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -628,6 +628,15 @@ int main() {
             //------------------------------------
             //------------------------------------
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
+                std::cout << "TEST[" << idx_variant << "]: Dynamic quantization + matmul" << std::endl;
+                std::cout << "- ukernel: " << ukernel_variants[idx_variant].name << std::endl;
+                // GEMV variants (m_step == 1) are only run for m == 1; report them as skipped for larger m
+                if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+                    std::cout << "- Status: SKIPPED - GEMV kernels are optimized for m=1 only" << std::endl;
+                    std::cout << "------------" << std::endl;
+                    continue;
+                }
+
                 // Get the packing parameters
                 const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
                 const size_t nr = ukernel_variants[idx_variant].ukernel.get_nr();
@@ -735,8 +744,6 @@ int main() {
                 const bool is_valid =
                     is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
-                std::cout << "TEST[" << idx_variant << "]: Dynamic quantization + matmul" << std::endl;
-                std::cout << "- ukernel: " << ukernel_variants[idx_variant].name << std::endl;
                 if (is_valid) {
                     std::cout << "- Status: PASSED" << std::endl;
                     std::cout << "- Performance: " << elap.count() << " us" << std::endl;
diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
index 53e64664a602565caa8d10643043a74f72d24416..ed2623243ebc5c8b6d33f8fa3e72bc56aa890d05 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -606,6 +606,12 @@ int main(int argc, char** argv) {
             //------------------------------------
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
                 std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
+                // Skip gemv kernels for non-gemv shapes
+                if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+                    std::cout << "TEST[" << idx_variant << "] = SKIPPED" << std::endl;
+                    std::cout << "- GEMV kernels are optimized for m=1 only, but here m=" << m << std::endl;
+                    continue;
+                }
 
                 // Get the packing parameters
                 const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
@@ -692,9 +698,9 @@ int main(int argc, char** argv) {
                     is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
                 if (is_valid) {
-                    printf("TEST[%ld] = PASSED\n", idx_variant);
+                    std::cout << "TEST[" << idx_variant << "] = PASSED" << std::endl;
                 } else {
-                    printf("TEST[%ld] = FAILED\n", idx_variant);
+                    std::cout << "TEST[" << idx_variant << "] = FAILED" << std::endl;
                 }
 
                 delete[] lhs_packed_mtx_qa8dx;
diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
index 6d992b625e4387f7d5e7230261764ce11119e545..9d233f33dc4a89b7bb0f44ca6ab4e07e0f814411 100644
--- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -341,6 +341,12 @@ int main(int argc, char** argv) {
     //------------------------------------
     for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
         std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
+        // Skip gemv kernels for non-gemv shapes
+        if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+            std::cout << "TEST[" << idx_variant << "] = SKIPPED" << std::endl;
+            std::cout << "- GEMV kernels are optimized for m=1 only, but here m=" << m << std::endl;
+            continue;
+        }
 
         // Get the packing parameters
         const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
@@ -415,10 +421,10 @@ int main(int argc, char** argv) {
             is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
         if (is_valid) {
-            printf("TEST[%ld] = PASSED\n", idx_variant);
+            std::cout << "TEST[" << idx_variant << "] = PASSED" << std::endl;
             std::cout << "- Performance: " << elap.count() << " us" << std::endl;
         } else {
-            printf("TEST[%ld] = FAILED\n", idx_variant);
+            std::cout << "TEST[" << idx_variant << "] = FAILED" << std::endl;
         }
         delete[] lhs_packed_mtx_qs8d32;
         delete[] rhs_packed_mtx_qs4c32;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
index c7fac42187d8d0a66b25afc022dcf92cadf77317..843b5c826c4f125b78d11763b61a1d18d2a7022a 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
index ee62e401168447ef1060c7149ae393fcc14439af..3974bccfc21be0d7239214e01afc447f54851379 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
index 3fc3cfc6573ec1afea035b659c2568cb71a15c69..67e2bd5fa3850384786722dba2f48710626e995a 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
index ad9327f4a1b1e0df7508a572ba7efae7aa0e7982..b71cd9558e1194f851ec93a9475d45427d5939f6 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
index 4738d9aa11a0ffbffc797f778bd35627f383bd84..06318517956d153ddfd14eae33c871a1d0d79879 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
index b8e2f832a6e19f99910fd6996d6c9365a7e37b98..45b54cdf311add31ead21bcd4aa70823ae1837d7 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
@@ -115,6 +115,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(
     float* dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0 || n == 0 || k == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
index ab136e1a6033aa418b2d155051caf7f834bbc58a..d56a7205048f9190ed1033605f4d597deb21eacb 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
@@ -126,6 +126,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
index 03cf6abb0b7a18ec8cb9b44170580e91a3aefaa7..20b178f53652480e01d536ad7d37ad90db74429e 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
@@ -103,6 +103,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(
     float* restrict dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
index c9122f5a5d467765c8426f899388e7c607597289..4bd47c15047ce31619e1a46ec40e8245e3660d4e 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
@@ -103,6 +103,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(
     float* restrict dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
index bd5246fa692d7bf88fd9fc038fb34c99edab431a..1d159b4f846fb500835e0af40ff3c65ab6364448 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
@@ -124,6 +124,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
index 784a1165aae26d228161c56b07d656a0db096b96..4367df9f487a997ee75e7fb0bc2cc4444616d3a0 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
@@ -124,6 +124,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
index 69096ff732778cd4898d91568cda70d008d86b93..e4f2bcfdeade71e988f1365dcbb666e64e440e7d 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
@@ -115,6 +115,7 @@ void kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod(
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSUME(bl == kai_bl);
     KAI_ASSUME(k % kai_bl == 0);
+    KAI_ASSUME(m <= 1);  // GEMV kernel: reject m > 1; m == 0 must still reach the early return below
     KAI_ASSUME(dst_stride_col == sizeof(float));
 
     if (m == 0) {
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
index 90000e97cbff905c002827ed40d4dac1a5c13f88..96f67ffea7ace315a60764114b5266d0a40dd16f 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
@@ -173,6 +173,13 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -193,9 +200,6 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
         ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -307,6 +311,13 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs_transposed = fill_random<float>(N * K, seed + 1);
@@ -339,9 +350,6 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
         ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
index c0018dd600342fc7f25cac1a42ca72e99d19d96c..ea8eb71539cbc2781973c4085fb6427426bcb2ae 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
@@ -291,6 +291,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -310,9 +317,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -415,6 +419,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -434,9 +445,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -542,6 +550,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -572,9 +587,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -670,6 +682,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -701,9 +720,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
index 4dd69e4ba9179f547595932e5ea708d43b096950..208c9a5c69fd28789eeb38cac9e0a28756c4b983 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
@@ -112,6 +112,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -131,9 +138,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -226,6 +230,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -256,9 +267,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
index ce5c6ebce2e3a8087d452500465c9c0a14ee5334..17e4ba448314b03cc155ee4f57c4fd23bebe96c1 100644
--- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
@@ -179,13 +179,13 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) {
     const auto kr = ukernel_variant.ukernel.interface.get_kr();
     const auto sr = ukernel_variant.ukernel.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
-    }
-
-    auto m_step = ukernel_variant.ukernel.interface.get_m_step();
+    const auto m_step = ukernel_variant.ukernel.interface.get_m_step();
     ASSERT_TRUE(m_step % mr == 0);
 
+    if (m_step == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
+    }
+
     auto n_step = ukernel_variant.ukernel.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);