From 674b0056052382ed1511118c0984b0d1eb8bb97b Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Tue, 25 Feb 2025 09:48:52 +0000
Subject: [PATCH 1/8] add KAI_ASSUME(m==1) to all gemv ukernels

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 ...ai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c | 1 +
 ...ai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c | 1 +
 ...matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c | 1 +
 ...ai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c | 1 +
 ...matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c | 1 +
 ...ai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c | 1 +
 ...kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c | 1 +
 ..._matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c | 1 +
 ..._matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c | 1 +
 ...kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c | 1 +
 ...kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c | 1 +
 11 files changed, 11 insertions(+)

diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
index c7fac421..843b5c82 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
index ee62e401..3974bccf 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
index 3fc3cfc6..67e2bd5f 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
index ad9327f4..b71cd955 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
index 4738d9aa..06318517 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c
@@ -159,6 +159,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod(
     KAI_ASSUME(dst_stride_col == sizeof(float));
     KAI_ASSUME((k % bl) == 0);
     KAI_ASSUME((bl % kai_bl) == 0);
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
index b8e2f832..45b54cdf 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.c
@@ -115,6 +115,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot(
     float* dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0 || n == 0 || k == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
index ab136e1a..d56a7205 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.c
@@ -126,6 +126,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
index 03cf6abb..20b178f5 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c
@@ -103,6 +103,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(
     float* restrict dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
index c9122f5a..4bd47c15 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c
@@ -103,6 +103,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(
     float* restrict dst,  // NOLINT(readability-non-const-parameter)
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSERT(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
index bd5246fa..1d159b4f 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
@@ -124,6 +124,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
index 784a1165..4367df9f 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c
@@ -124,6 +124,7 @@ void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod(
     float scalar_min,                 //
     float scalar_max) {
     KAI_ASSUME(dst_stride_col == sizeof(float));
+    KAI_ASSUME(m == 1);
 
     if (m == 0) {
         return;
-- 
GitLab


From 405cc8387d7666d2d28b79d14ca34720bc4ecc5d Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Fri, 28 Feb 2025 13:08:02 +0000
Subject: [PATCH 2/8] add KAI_ASSUME(m == 1) to qsi8d32p_qsi4c32p gemv ukernel

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 ...atmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
index 69096ff7..e4f2bcfd 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
@@ -115,6 +115,7 @@ void kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod(
     size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
     KAI_ASSUME(bl == kai_bl);
     KAI_ASSUME(k % kai_bl == 0);
+    KAI_ASSUME(m == 1);
     KAI_ASSUME(dst_stride_col == sizeof(float));
 
     if (m == 0) {
-- 
GitLab


From c9b54a6a3ee49daa9d67027b689aa6549df81f20 Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Fri, 28 Feb 2025 15:43:21 +0000
Subject: [PATCH 3/8] modify example tests to skip gemv kernels for non-gemv
 shapes

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 .../matmul_clamp_f32_qai8dxp_qsi4c32p.cpp           | 13 ++++++++++---
 .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp            |  7 ++++++-
 .../matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp          |  7 ++++++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
index 4ce0e7b4..64e6d34b 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -628,6 +628,15 @@ int main() {
             //------------------------------------
             //------------------------------------
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
+                std::cout << "TEST[" << idx_variant << "]: Dynamic quantization + matmul" << std::endl;
+                std::cout << "- ukernel: " << ukernel_variants[idx_variant].name << std::endl;
+                // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+                if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+                    std::cout << "Status: SKIPPED" << std::endl;
+                    std::cout << "------------" << std::endl;
+                    continue;
+                }
+
                 // Get the packing parameters
                 const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
                 const size_t nr = ukernel_variants[idx_variant].ukernel.get_nr();
@@ -735,8 +744,6 @@ int main() {
                 const bool is_valid =
                     is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
-                std::cout << "TEST[" << idx_variant << "]: Dynamic quantization + matmul" << std::endl;
-                std::cout << "- ukernel: " << ukernel_variants[idx_variant].name << std::endl;
                 if (is_valid) {
                     std::cout << "- Status: PASSED" << std::endl;
                     std::cout << "- Performance: " << elap.count() << " us" << std::endl;
diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
index 53e64664..ef529954 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -606,6 +606,11 @@ int main(int argc, char** argv) {
             //------------------------------------
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
                 std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
+                // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+                if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+                    printf("TEST[%ld] = SKIPPED\n", idx_variant);
+                    continue;
+                }
 
                 // Get the packing parameters
                 const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
index 6d992b62..118f14ca 100644
--- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
@@ -1,5 +1,5 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
 //
 // SPDX-License-Identifier: Apache-2.0
 //
@@ -341,6 +341,11 @@ int main(int argc, char** argv) {
     //------------------------------------
     for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
         std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
+        // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+        if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
+            printf("TEST[%ld] = SKIPPED\n", idx_variant);
+            continue;
+        }
 
         // Get the packing parameters
         const size_t mr = ukernel_variants[idx_variant].ukernel.get_mr();
-- 
GitLab


From 79ef006b4a23794bd9c0227e59d2e6ee3e07ecd8 Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Fri, 28 Feb 2025 17:17:04 +0000
Subject: [PATCH 4/8] skip gemv unit tests for unsupported shapes

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 .../matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp   |  8 ++++++++
 .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp    | 16 ++++++++++++++++
 .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp    |  8 ++++++++
 3 files changed, 32 insertions(+)

diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
index 90000e97..a6db73c5 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
@@ -173,6 +173,10 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -307,6 +311,10 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs_transposed = fill_random<float>(N * K, seed + 1);
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
index c0018dd6..5c84b44b 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
@@ -291,6 +291,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -415,6 +419,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -542,6 +550,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -670,6 +682,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
index 4dd69e4b..8f0fa8b9 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
@@ -112,6 +112,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
@@ -226,6 +230,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
+    if (mr == 1 && M > 1) {
+        GTEST_SKIP() << "Kernel does not support M != 1";
+    }
+
     // Generates input data.
     const auto ref_lhs = fill_random<float>(M * K, seed + 0);
     const auto ref_rhs = fill_random<float>(N * K, seed + 1);
-- 
GitLab


From cfa05472e1149f616b0c760c946dd7bdee7ef377 Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Mon, 3 Mar 2025 11:21:55 +0000
Subject: [PATCH 5/8] state the skip reason in example test output

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 .../matmul_clamp_f32_qai8dxp_qsi4c32p.cpp                     | 4 ++--
 .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp                      | 4 ++--
 .../matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp                    | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
index 64e6d34b..16b95b7c 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/matmul_clamp_f32_qai8dxp_qsi4c32p.cpp
@@ -630,9 +630,9 @@ int main() {
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
                 std::cout << "TEST[" << idx_variant << "]: Dynamic quantization + matmul" << std::endl;
                 std::cout << "- ukernel: " << ukernel_variants[idx_variant].name << std::endl;
-                // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+                // Skip gemv kernels for non-gemv shapes
                 if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
-                    std::cout << "Status: SKIPPED" << std::endl;
+                    std::cout << "Status: SKIPPED - GEMV kernels are optimized for m=1 only" << std::endl;
                     std::cout << "------------" << std::endl;
                     continue;
                 }
diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
index ef529954..be609b28 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
@@ -606,9 +606,9 @@ int main(int argc, char** argv) {
             //------------------------------------
             for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
                 std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
-                // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+                // Skip gemv kernels for non-gemv shapes
                 if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
-                    printf("TEST[%ld] = SKIPPED\n", idx_variant);
+                    printf("TEST[%ld] = SKIPPED (GEMV kernels optimized for m=1 only)\n", idx_variant);
                     continue;
                 }
 
diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
index 118f14ca..60185ee1 100644
--- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
@@ -341,9 +341,9 @@ int main(int argc, char** argv) {
     //------------------------------------
     for (size_t idx_variant = 0; idx_variant < num_ukernel_variants; ++idx_variant) {
         std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
-        // Skip gemv kernels for non-gemv shapes. Gemv kernels are optimized for m=1 only
+        // Skip gemv kernels for non-gemv shapes
         if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
-            printf("TEST[%ld] = SKIPPED\n", idx_variant);
+            printf("TEST[%ld] = SKIPPED (GEMV kernels optimized for m=1 only)\n", idx_variant);
             continue;
         }
 
-- 
GitLab


From c4b01701884c286ad498d656f52808b15a3c15d1 Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Mon, 3 Mar 2025 14:18:51 +0000
Subject: [PATCH 6/8] update print statements in examples to C++ style

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp                   | 7 ++++---
 .../matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp                 | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
index be609b28..ed262324 100644
--- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
+++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp
@@ -608,7 +608,8 @@ int main(int argc, char** argv) {
                 std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
                 // Skip gemv kernels for non-gemv shapes
                 if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
-                    printf("TEST[%ld] = SKIPPED (GEMV kernels optimized for m=1 only)\n", idx_variant);
+                    std::cout << "TEST[" << idx_variant << "] = SKIPPED" << std::endl;
+                    std::cout << "- GEMV kernels are optimized for m=1 only, but here m=" << m << std::endl;
                     continue;
                 }
 
@@ -697,9 +698,9 @@ int main(int argc, char** argv) {
                     is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
                 if (is_valid) {
-                    printf("TEST[%ld] = PASSED\n", idx_variant);
+                    std::cout << "TEST[" << idx_variant << "] = PASSED" << std::endl;
                 } else {
-                    printf("TEST[%ld] = FAILED\n", idx_variant);
+                    std::cout << "TEST[" << idx_variant << "] = FAILED" << std::endl;
                 }
 
                 delete[] lhs_packed_mtx_qa8dx;
diff --git a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
index 60185ee1..9d233f33 100644
--- a/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
+++ b/examples/matmul_clamp_f32_qsi8d32p_qsi4c32p/matmul_clamp_f32_qsi8d32p_qsi4c32p.cpp
@@ -343,7 +343,8 @@ int main(int argc, char** argv) {
         std::cout << "Testing " << ukernel_variants[idx_variant].name << std::endl;
         // Skip gemv kernels for non-gemv shapes
         if ((m > 1) && (ukernel_variants[idx_variant].ukernel.get_m_step() == 1)) {
-            printf("TEST[%ld] = SKIPPED (GEMV kernels optimized for m=1 only)\n", idx_variant);
+            std::cout << "TEST[" << idx_variant << "] = SKIPPED" << std::endl;
+            std::cout << "- GEMV kernels are optimized for m=1 only, but here m=" << m << std::endl;
             continue;
         }
 
@@ -420,10 +421,10 @@ int main(int argc, char** argv) {
             is_output_correct(m, n, 0.0001f, (const float*)dst_ref_mtx_f32, (const float*)dst_act_mtx_f32);
 
         if (is_valid) {
-            printf("TEST[%ld] = PASSED\n", idx_variant);
+            std::cout << "TEST[" << idx_variant << "] = PASSED" << std::endl;
             std::cout << "- Performance: " << elap.count() << " us" << std::endl;
         } else {
-            printf("TEST[%ld] = FAILED\n", idx_variant);
+            std::cout << "TEST[" << idx_variant << "] = FAILED" << std::endl;
         }
         delete[] lhs_packed_mtx_qs8d32;
         delete[] rhs_packed_mtx_qs4c32;
-- 
GitLab


From f492b865b5803ab1abcead726c65d78db6e02131 Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Mon, 3 Mar 2025 14:38:53 +0000
Subject: [PATCH 7/8] print current value of M when skipping unit tests because
 M > 1

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp  | 4 ++--
 test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp   | 8 ++++----
 test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp   | 4 ++--
 test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
index a6db73c5..5925a18f 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
@@ -174,7 +174,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
@@ -312,7 +312,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
index 5c84b44b..8c418a92 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
@@ -292,7 +292,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
@@ -420,7 +420,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
@@ -551,7 +551,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
@@ -683,7 +683,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
index 8f0fa8b9..365da8e3 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
@@ -113,7 +113,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
@@ -231,7 +231,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
     const auto sr = ukernel_variant.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     // Generates input data.
diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
index ce5c6ebc..1236677f 100644
--- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
@@ -180,7 +180,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) {
     const auto sr = ukernel_variant.ukernel.interface.get_sr();
 
     if (mr == 1 && M > 1) {
-        GTEST_SKIP() << "Kernel does not support M != 1";
+        GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
     auto m_step = ukernel_variant.ukernel.interface.get_m_step();
-- 
GitLab


From 2678c9fb996ce7b0f9723156e08ea8ebb6de4b5f Mon Sep 17 00:00:00 2001
From: Evie Wright <evie.wright@arm.com>
Date: Tue, 4 Mar 2025 11:39:56 +0000
Subject: [PATCH 8/8] switch check for gemv kernel to use m_step in all cases

Signed-off-by: Evie Wright <evie.wright@arm.com>
---
 ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 16 +++++-----
 .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 32 +++++++++----------
 .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp | 16 +++++-----
 ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp |  8 ++---
 4 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
index 5925a18f..96f67ffe 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
@@ -173,7 +173,10 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -197,9 +200,6 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
         ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -311,7 +311,10 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -347,9 +350,6 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
         ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
index 8c418a92..ea8eb715 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp
@@ -291,7 +291,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -314,9 +317,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -419,7 +419,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -442,9 +445,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -550,7 +550,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -584,9 +587,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -682,7 +682,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -717,9 +720,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
index 365da8e3..208c9a5c 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp
@@ -112,7 +112,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -135,9 +138,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
@@ -230,7 +230,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
     const auto kr = ukernel_variant.interface.get_kr();
     const auto sr = ukernel_variant.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
@@ -264,9 +267,6 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) {
         ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits<float>::lowest(),
         std::numeric_limits<float>::max());
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
index 1236677f..17e4ba44 100644
--- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp
@@ -179,13 +179,13 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) {
     const auto kr = ukernel_variant.ukernel.interface.get_kr();
     const auto sr = ukernel_variant.ukernel.interface.get_sr();
 
-    if (mr == 1 && M > 1) {
+    const auto m_step = ukernel_variant.ukernel.interface.get_m_step();
+    ASSERT_TRUE(m_step % mr == 0);
+
+    if (m_step == 1 && M > 1) {
         GTEST_SKIP() << "Kernel does not support M != 1, but here M = " << M;
     }
 
-    auto m_step = ukernel_variant.ukernel.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
-
     auto n_step = ukernel_variant.ukernel.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
-- 
GitLab