From f52eeedd1f94ddf861c84ce2535b60286eceb40e Mon Sep 17 00:00:00 2001
From: Jakub Sujak <jakub.sujak@arm.com>
Date: Fri, 25 Oct 2024 15:33:18 +0100
Subject: [PATCH 1/2] Add SME F32 GEMV pairing for the GEMM

Adds an SME2 F32 MatMul (1xN) micro-kernel that operates on the same packed RHS as the main SME2 F32 MatMul (MxN) micro-kernel. In other words, the (1xN) and (MxN) micro-kernels share a single RHS packed with the same `nr` and `kr` packing parameters, as indicated by `2vlx1` in their names.

Pairing a (1xN) micro-kernel with an (MxN) micro-kernel suits AI-framework workloads where the LHS is dynamic (so the value of M can change between calls) while the RHS is shared, because the RHS then needs to be packed only once.

Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
---
 CMakeLists.txt                                |   1 +
 kai/ukernels/matmul/BUILD.bazel               |   8 +
 ...clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c | 837 ++++++++++++++++++
 ...clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h | 116 +++
 test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 124 ++-
 5 files changed, 1059 insertions(+), 27 deletions(-)
 create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c
 create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6dd83d7d..34b345c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,6 +132,7 @@ set(KLEIDIAI_FILES_SME
 set(KLEIDIAI_FILES_SME2
     kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.c
     kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla.c
+    kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c
 )
 
 add_library(kleidiai)
diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel
index 0704a910..c2c8283a 100644
--- a/kai/ukernels/matmul/BUILD.bazel
+++ b/kai/ukernels/matmul/BUILD.bazel
@@ -86,6 +86,13 @@ kai_c_library(
     cpu_uarch = kai_cpu_sme(),
 )
 
+kai_c_library(
+    name = "clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla",
+    srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c"],
+    hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"],
+    cpu_uarch = kai_cpu_sme(),
+)
+
 cc_library(
     name = "clamp_f32_qai8dxp_qsi4cxp_interface",
     hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h"],
@@ -369,6 +376,7 @@ kai_c_library(
         ":clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla",
         ":clamp_f32_bf16p_bf16p_neon_mmla",
         ":clamp_f32_f32_f32p",
+        ":clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla",
         ":clamp_f32_f32_f32pb_1x16vl_sme2_mla",
         ":clamp_f32_f32p_f32p",
         ":clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod",
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c
new file mode 100644
index 00000000..8809e5a6
--- /dev/null
+++ b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c
@@ -0,0 +1,837 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#if !defined(__aarch64__) || !defined(__ARM_FEATURE_SVE2)
+#error This file must be compiled for AArch64, FEAT_SVE2.
+#else  // Architectural features check.
+
+#include "kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "kai/kai_common.h"
+
+static const size_t kai_m_step = 1;  // base M step, read by kai_get_m_step_*()
+static const size_t kai_n_step = 16;  // multiplied by kai_get_sme_vector_length_u32() in kai_get_n_step_*()
+static const size_t kai_nr = 2;  // multiplied by kai_get_sme_vector_length_u32() in kai_get_nr_*()
+static const size_t kai_kr = 1;  // returned as-is by kai_get_kr_*()
+static const size_t kai_sr = 1;  // returned as-is by kai_get_sr_*()
+
+size_t kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {  // M step of this 1xN kernel
+    return kai_m_step;  // not scaled by the SME vector length: kai_run_*() only accepts m == 1
+}
+
+size_t kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
+    return kai_get_sme_vector_length_u32() * kai_n_step;  // N step = kai_n_step times the reported vector length
+}
+
+size_t kai_get_nr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
+    return kai_get_sme_vector_length_u32() * kai_nr;  // nr = kai_nr times the reported vector length
+}
+
+size_t kai_get_kr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
+    return kai_kr;  // file-level constant, returned without any vector-length scaling
+}
+
+size_t kai_get_sr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void) {
+    return kai_sr;  // file-level constant, returned without any vector-length scaling
+}
+
+size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m_idx, size_t lhs_stride) {
+    // m_idx has to be a multiple of this micro-kernel's M step; the offset is m_idx rows of lhs_stride each.
+    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);
+    return lhs_stride * m_idx;
+}
+
+size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t n_idx, size_t k) {
+    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);
+    return ((k + 1) * sizeof(float)) * n_idx;  // k + 1 floats per n index (presumably k values plus one bias)
+}
+
+size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
+    size_t m_idx, size_t n_idx, size_t dst_stride) {
+    KAI_ASSUME(m_idx % kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);
+    KAI_ASSUME(n_idx % kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla() == 0);
+    // Column part: n_idx floats into the row; row part: m_idx strides of dst_stride.
+    return (n_idx * sizeof(float)) + (dst_stride * m_idx);
+}
+
+size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m, size_t n) {
+    return m * n * sizeof(float);  // size in bytes of m * n float elements
+}
+
+void kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
+    size_t m, size_t n, size_t k, const void* lhs, size_t lhs_stride, const void* rhs_packed, void* dst,
+    size_t dst_stride_row, size_t dst_stride_col, float clamp_min, float clamp_max) {
+    KAI_UNUSED(lhs_stride);
+    KAI_UNUSED(dst_stride_row);
+    KAI_UNUSED(dst_stride_col);
+
+    KAI_ASSUME(m == 1);
+
+    typedef struct {
+        float maxval;
+        float minval;
+    } KernelArgs;
+
+    KernelArgs ka;
+    ka.maxval = clamp_max;
+    ka.minval = clamp_min;
+
+    size_t N = n;
+    size_t K = k;
+
+    const void* A_ptr = lhs;
+    const void* B_ptr = rhs_packed;
+    void* output_ptr = dst;
+
+    uint64_t flags = 0;
+
+    __asm__ __volatile__(
+        "ptrue p8.b\n"
+        ".inst 0xd503477f  // SMSTART ZA\n"
+        "mov x8, #0x0\n"
+        "cntw x16, ALL, MUL #4\n"
+        "mov x15, %x[B_ptr]\n"
+        "add x14, %x[N], x16\n"
+        "mov x13, %x[output_ptr]\n"
+        "sub x14, x14, #0x1\n"
+        "ptrue p1.b\n"
+        "udiv x14, x14, x16\n"
+        ".inst 0x25207811  // ptrue pn9.b\n"
+        "add x22, x14, #0x3\n"
+        "mov x21, #0x1\n"
+        "and x22, x22, #0xfffffffffffffffc\n"
+        "mul x22, x22, x16\n"
+        "mul x22, x22, %x[K]\n"
+        "lsl x22, x22, #0x2\n"
+        "1:"  // RHS size check loop
+        "cmp x22, #0x200000\n"
+        "blt 2f\n"
+        "tbnz x22, #0, 3f\n"
+        "lsr x22, x22, #0x1\n"
+        "lsl x21, x21, #0x1\n"
+        "b 1b\n"
+        "2:"  // RHS do prefetch
+        "lsl x20, x22, #0x26\n"
+        "sub x21, x21, #0x1\n"
+        "lsl x21, x21, #0x16\n"
+        "orr x22, x22, x20\n"
+        "orr x22, x22, x21\n"
+        ".inst 0xf8b649fa  // rprfm pldonce, x22, [x15]\n"
+        "3:"  // RHS prefetch exit
+        "mov x12, %x[K]\n"
+        "cntw x20, ALL, MUL #2\n"
+        "lsl x12, x12, #0x2\n"
+        "add x12, x12, #0x4\n"
+        "mul x12, x12, x20\n"
+        "4:"  // Column loop
+        "cmp x14, #0x4\n"
+        "bge 22f\n"
+        "cmp x14, #0x2\n"
+        "bgt 16f\n"
+        "beq 10f\n"
+        "cntw x20, ALL, MUL #2\n"
+        "add x22, x15, x12\n"
+        ".inst 0xa04045f4  // ld1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "cmp %x[N], x20\n"
+        "mov x11, %x[K]\n"
+        "csel x22, x22, x15, GT\n"
+        "mov x21, %x[N]\n"
+        ".inst 0xa04046d6  // ld1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
+        "mov x10, %x[A_ptr]\n"
+        "lsl x20, %x[K], #0x2\n"
+        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
+        "addvl x15, x15, #2\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc0040e80  // mova za.d[x8, #0], { z20.d-z23.d }\n"
+        "ble 6f\n"
+        "5:"  // Width 1: Multiply loop: Main loop head
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        "ld1rqw { z15.s }, p0/Z, [x10]\n"
+        "sub x11, x11, #0x4\n"
+        "add x10, x10, #0x10\n"
+        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc15f8080  // fmla za.s[x8, 0], { z4.s-z7.s }, z15.s[0]\n"
+        ".inst 0xa04045e1  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046c3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa04045f5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc15f8780  // fmla za.s[x8, 0], { z28.s-z31.s }, z15.s[1]\n"
+        ".inst 0xc15f8800  // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s[2]\n"
+        ".inst 0xc15f8e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s[3]\n"
+        "bgt 5b\n"
+        "6:"  // Width 1: Multiply loop: Single iteration only
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e1  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "ld1rqw { z8.s }, p0/Z, [x10]\n"
+        "add x10, x10, #0x10\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046c3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1588000  // fmla za.s[x8, 0], { z0.s-z3.s }, z8.s[0]\n"
+        "ble 7f\n"
+        ".inst 0xa04045f1  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1588600  // fmla za.s[x8, 0], { z16.s-z19.s }, z8.s[1]\n"
+        "ble 7f\n"
+        ".inst 0xa04045f5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1588a80  // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s[2]\n"
+        "ble 7f\n"
+        ".inst 0xa04045ed  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1588d80  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[3]\n"
+        "7:"  // Width 1: Multiply loop: multiply skip
+        "tbz %x[flags], #1, 8f\n"
+        "add x21, %x[args_ptr], %[offset_min]\n"
+        "add x20, %x[args_ptr], %[offset_max]\n"
+        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+        "ld1rw { z21.s }, p1/Z, [x21]\n"
+        "ld1rw { z29.s }, p1/Z, [x20]\n"
+        ".inst 0xc1bdcaa8  // fclamp { z8.s-z11.s }, z21.s, z29.s\n"
+        ".inst 0xa060c1a8  // st1w { z8.s-z11.s }, p8, [x13]\n"
+        "addvl x13, x13, #4\n"
+        "b 9f\n"
+        "8:"  // Width 1: No activation
+        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+        ".inst 0xa060c1a8  // st1w { z8.s-z11.s }, p8, [x13]\n"
+        "addvl x13, x13, #4\n"
+        "9:"  // Width 1: Output done
+        "b 28f\n"
+        "10:"  // Width 2
+        "add x24, x15, x12, LSL #1\n"
+        "cntw x20, ALL, MUL #6\n"
+        ".inst 0xa04045e4  // ld1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "add x23, x24, x12\n"
+        "cmp %x[N], x20\n"
+        ".inst 0xa0404700  // ld1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
+        "add x22, x15, x12\n"
+        "csel x23, x23, x15, GT\n"
+        ".inst 0xa04046c6  // ld1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
+        "mov x11, %x[K]\n"
+        "sub x21, %x[N], x16\n"
+        ".inst 0xa04046e2  // ld1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
+        "mov x10, %x[A_ptr]\n"
+        "lsl x20, %x[K], #0x2\n"
+        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xc0040c80  // mova za.d[x8, #0], { z4.d-z7.d }\n"
+        "addvl x22, x22, #2\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc0040c01  // mova za.d[x8, #1], { z0.d-z3.d }\n"
+        "addvl x23, x23, #2\n"
+        "ble 12f\n"
+        "11:"  // Width 2: Multiply loop: Main loop head
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        "ld1rqw { z0.s }, p0/Z, [x10]\n"
+        "sub x11, x11, #0x4\n"
+        "add x10, x10, #0x10\n"
+        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xa0404715  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1508080  // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s[0]\n"
+        ".inst 0xa04045f9  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1508281  // fmla za.s[x8, 1], { z20.s-z23.s }, z0.s[0]\n"
+        ".inst 0xa0404709  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1508700  // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s[1]\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1508501  // fmla za.s[x8, 1], { z8.s-z11.s }, z0.s[1]\n"
+        ".inst 0xa0404709  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1508b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z0.s[2]\n"
+        ".inst 0xa04045f9  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xc1508901  // fmla za.s[x8, 1], { z8.s-z11.s }, z0.s[2]\n"
+        ".inst 0xa040470d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1508f00  // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s[3]\n"
+        ".inst 0xc1508d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s[3]\n"
+        "bgt 11b\n"
+        "12:"  // Width 2: Multiply loop: Single iteration only
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "ld1rqw { z8.s }, p0/Z, [x10]\n"
+        "add x10, x10, #0x10\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404715  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
+        ".inst 0xc1588281  // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s[0]\n"
+        "ble 13f\n"
+        ".inst 0xa04045ed  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa040471d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046ff  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1588580  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[1]\n"
+        ".inst 0xc1588781  // fmla za.s[x8, 1], { z28.s-z31.s }, z8.s[1]\n"
+        "ble 13f\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404701  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1588b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[2]\n"
+        ".inst 0xc1588801  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[2]\n"
+        "ble 13f\n"
+        ".inst 0xa04045f5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046d7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404701  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1588e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s[3]\n"
+        ".inst 0xc1588c01  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[3]\n"
+        "13:"  // Width 2: Multiply loop: multiply skip
+        "tbz %x[flags], #1, 14f\n"
+        "add x21, %x[args_ptr], %[offset_min]\n"
+        "add x20, %x[args_ptr], %[offset_max]\n"
+        ".inst 0xc0060c1c  // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c24  // mova { z4.d-z7.d }, za.d[x8, #1]\n"
+        "ld1rw { z17.s }, p1/Z, [x21]\n"
+        "ld1rw { z9.s }, p1/Z, [x20]\n"
+        ".inst 0xc1a9ca3c  // fclamp { z28.s-z31.s }, z17.s, z9.s\n"
+        ".inst 0xc1a9ca24  // fclamp { z4.s-z7.s }, z17.s, z9.s\n"
+        ".inst 0xa060c5bc  // st1w { z28.s-z31.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c1a4  // st1w { z4.s-z7.s }, p8, [x13, #0x4, MUL VL]\n"
+        "addvl x13, x13, #8\n"
+        "b 15f\n"
+        "14:"  // Width 2: No activation
+        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c3c  // mova { z28.d-z31.d }, za.d[x8, #1]\n"
+        ".inst 0xa060c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c1bc  // st1w { z28.s-z31.s }, p8, [x13, #0x4, MUL VL]\n"
+        "addvl x13, x13, #8\n"
+        "15:"  // Width 2: Output done
+        "b 28f\n"
+        "16:"  // Width 3
+        "add x26, x15, x12, LSL #2\n"
+        "cntw x20, ALL, MUL #10\n"
+        ".inst 0xa04045f4  // ld1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "add x25, x15, x12, LSL #1\n"
+        "add x24, x26, x12\n"
+        ".inst 0xa0404740  // ld1w { z0.s-z1.s }, pn9.b/Z, [x26]\n"
+        "cmp %x[N], x20\n"
+        "add x23, x15, x12\n"
+        ".inst 0xa0404730  // ld1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
+        "add x22, x25, x12\n"
+        "csel x24, x24, x15, GT\n"
+        ".inst 0xa04046f6  // ld1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
+        "mov x20, #0x2\n"
+        ".inst 0xa04046d2  // ld1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
+        "mov x11, %x[K]\n"
+        ".inst 0xa0404702  // ld1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "msub x21, x16, x20, %x[N]\n"
+        "mov x10, %x[A_ptr]\n"
+        "lsl x20, %x[K], #0x2\n"
+        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
+        ".inst 0xc0040e80  // mova za.d[x8, #0], { z20.d-z23.d }\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
+        ".inst 0xc0040e01  // mova za.d[x8, #1], { z16.d-z19.d }\n"
+        "addvl x15, x15, #2\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc0040c02  // mova za.d[x8, #2], { z0.d-z3.d }\n"
+        "addvl x25, x25, #2\n"
+        "addvl x22, x22, #2\n"
+        "addvl x26, x26, #2\n"
+        "addvl x24, x24, #2\n"
+        "ble 18f\n"
+        "17:"  // Width 3: Multiply loop: Main loop head
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045ed  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        "ld1rqw { z3.s }, p0/Z, [x10]\n"
+        "sub x11, x11, #0x4\n"
+        "add x10, x10, #0x10\n"
+        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xa0404729  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046cb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1538180  // fmla za.s[x8, 0], { z12.s-z15.s }, z3.s[0]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1538101  // fmla za.s[x8, 1], { z8.s-z11.s }, z3.s[0]\n"
+        ".inst 0xa04045e9  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1538202  // fmla za.s[x8, 2], { z16.s-z19.s }, z3.s[0]\n"
+        ".inst 0xa0404731  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404745  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1538500  // fmla za.s[x8, 0], { z8.s-z11.s }, z3.s[1]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1538601  // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s[1]\n"
+        ".inst 0xa04045e9  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1538482  // fmla za.s[x8, 2], { z4.s-z7.s }, z3.s[1]\n"
+        ".inst 0xa0404731  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046d3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404745  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1538900  // fmla za.s[x8, 0], { z8.s-z11.s }, z3.s[2]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1538a01  // fmla za.s[x8, 1], { z16.s-z19.s }, z3.s[2]\n"
+        ".inst 0xa04045f5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xc1538882  // fmla za.s[x8, 2], { z4.s-z7.s }, z3.s[2]\n"
+        ".inst 0xa0404739  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046db  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1538e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s[3]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1538f01  // fmla za.s[x8, 1], { z24.s-z27.s }, z3.s[3]\n"
+        ".inst 0xc1538e02  // fmla za.s[x8, 2], { z16.s-z19.s }, z3.s[3]\n"
+        "bgt 17b\n"
+        "18:"  // Width 3: Multiply loop: Single iteration only
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "ld1rqw { z8.s }, p0/Z, [x10]\n"
+        "add x10, x10, #0x10\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa040473d  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046df  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404755  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404717  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1588381  // fmla za.s[x8, 1], { z28.s-z31.s }, z8.s[0]\n"
+        ".inst 0xc1588282  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[0]\n"
+        "ble 19f\n"
+        ".inst 0xa04045ed  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404725  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046c7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1588580  // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s[1]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1588481  // fmla za.s[x8, 1], { z4.s-z7.s }, z8.s[1]\n"
+        ".inst 0xc1588602  // fmla za.s[x8, 2], { z16.s-z19.s }, z8.s[1]\n"
+        "ble 19f\n"
+        ".inst 0xa04045e1  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa040472d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404751  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1588800  // fmla za.s[x8, 0], { z0.s-z3.s }, z8.s[2]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404713  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1588981  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[2]\n"
+        ".inst 0xc1588a02  // fmla za.s[x8, 2], { z16.s-z19.s }, z8.s[2]\n"
+        "ble 19f\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa040472d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa04046cf  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x22]\n"
+        "addvl x22, x22, #2\n"
+        ".inst 0xa0404755  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x26]\n"
+        ".inst 0xc1588c80  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[3]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xa0404717  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xc1588d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[3]\n"
+        ".inst 0xc1588e82  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[3]\n"
+        "19:"  // Width 3: Multiply loop: multiply skip
+        "tbz %x[flags], #1, 20f\n"
+        "add x21, %x[args_ptr], %[offset_min]\n"
+        "add x20, %x[args_ptr], %[offset_max]\n"
+        ".inst 0xc0060c08  // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c2c  // mova { z12.d-z15.d }, za.d[x8, #1]\n"
+        "ld1rw { z21.s }, p1/Z, [x21]\n"
+        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
+        "ld1rw { z20.s }, p1/Z, [x20]\n"
+        ".inst 0xc1b4caa8  // fclamp { z8.s-z11.s }, z21.s, z20.s\n"
+        ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+        ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+        ".inst 0xa060c5a8  // st1w { z8.s-z11.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+        ".inst 0xa062c1b0  // st1w { z16.s-z19.s }, p8, [x13, #0x8, MUL VL]\n"
+        "addvl x13, x13, #12\n"
+        "b 21f\n"
+        "20:"  // Width 3: No activation
+        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
+        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
+        ".inst 0xa060c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c5a0  // st1w { z0.s-z3.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+        ".inst 0xa062c1b0  // st1w { z16.s-z19.s }, p8, [x13, #0x8, MUL VL]\n"
+        "addvl x13, x13, #12\n"
+        "21:"  // Width 3: Output done
+        "b 28f\n"
+        "22:"  // Width 4
+        "add x9, x15, x12, LSL #2\n"
+        "cntw x20, ALL, MUL #14\n"
+        ".inst 0xa04045ec  // ld1w { z12.s-z13.s }, pn9.b/Z, [x15]\n"
+        "add x28, x9, x12, LSL #1\n"
+        "add x27, x15, x12, LSL #1\n"
+        ".inst 0xa0404528  // ld1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
+        "add x26, x28, x12\n"
+        "cmp %x[N], x20\n"
+        ".inst 0xa0404760  // ld1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
+        "add x25, x15, x12\n"
+        "add x24, x27, x12\n"
+        ".inst 0xa0404790  // ld1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        "add x23, x9, x12\n"
+        "csel x26, x26, x15, GT\n"
+        ".inst 0xa040472e  // ld1w { z14.s-z15.s }, pn9.b/Z, [x25]\n"
+        "mov x20, #0x3\n"
+        ".inst 0xa0404702  // ld1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "mov x11, %x[K]\n"
+        ".inst 0xa04046ea  // ld1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "msub x21, x16, x20, %x[N]\n"
+        "mov x10, %x[A_ptr]\n"
+        ".inst 0xa0404752  // ld1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "lsl x20, %x[K], #0x2\n"
+        ".inst 0x25b567f0  // whilelt p8.s, XZR, x21, VLx4\n"
+        ".inst 0xc0040d80  // mova za.d[x8, #0], { z12.d-z15.d }\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xf8b44958  // rprfm pldmany, x20, [x10]\n"
+        ".inst 0xc0040c01  // mova za.d[x8, #1], { z0.d-z3.d }\n"
+        "add x22, x15, x12, LSL #3\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xc0040d02  // mova za.d[x8, #2], { z8.d-z11.d }\n"
+        "addvl x25, x25, #2\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xc0040e03  // mova za.d[x8, #3], { z16.d-z19.d }\n"
+        "addvl x24, x24, #2\n"
+        "addvl x9, x9, #2\n"
+        "addvl x23, x23, #2\n"
+        "addvl x28, x28, #2\n"
+        "addvl x26, x26, #2\n"
+        "ble 24f\n"
+        "23:"  // Width 4: Multiply loop: Main loop head
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e9  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        "ld1rqw { z13.s }, p0/Z, [x10]\n"
+        "sub x11, x11, #0x4\n"
+        "add x10, x10, #0x10\n"
+        ".inst 0xa040472b  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        "cmp x11, #0x4\n"
+        ".inst 0xa0404765  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa0404707  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404531  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc15d8100  // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s[0]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046f3  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404781  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc15d8081  // fmla za.s[x8, 1], { z4.s-z7.s }, z13.s[0]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404743  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc15d8202  // fmla za.s[x8, 2], { z16.s-z19.s }, z13.s[0]\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xc15d8003  // fmla za.s[x8, 3], { z0.s-z3.s }, z13.s[0]\n"
+        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc15d8780  // fmla za.s[x8, 0], { z28.s-z31.s }, z13.s[1]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc15d8401  // fmla za.s[x8, 1], { z0.s-z3.s }, z13.s[1]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc15d8502  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[1]\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xc15d8603  // fmla za.s[x8, 3], { z16.s-z19.s }, z13.s[1]\n"
+        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc15d8880  // fmla za.s[x8, 0], { z4.s-z7.s }, z13.s[2]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc15d8801  // fmla za.s[x8, 1], { z0.s-z3.s }, z13.s[2]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc15d8902  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[2]\n"
+        ".inst 0xa04045f5  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa0404737  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xc15d8a03  // fmla za.s[x8, 3], { z16.s-z19.s }, z13.s[2]\n"
+        ".inst 0xa0404779  // ldnt1w { z24.s-z25.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa040471b  // ldnt1w { z26.s-z27.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404529  // ldnt1w { z8.s-z9.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc15d8e80  // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s[3]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046eb  // ldnt1w { z10.s-z11.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404795  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc15d8f01  // fmla za.s[x8, 1], { z24.s-z27.s }, z13.s[3]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404757  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc15d8d02  // fmla za.s[x8, 2], { z8.s-z11.s }, z13.s[3]\n"
+        ".inst 0xc15d8e83  // fmla za.s[x8, 3], { z20.s-z23.s }, z13.s[3]\n"
+        "bgt 23b\n"
+        "24:"  // Width 4: Multiply loop: Single iteration only
+        "whilelt p0.s, XZR, x11\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "ld1rqw { z8.s }, p0/Z, [x10]\n"
+        "add x10, x10, #0x10\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa040452d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc1588080  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[0]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046ef  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc1588001  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[0]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc1588182  // fmla za.s[x8, 2], { z12.s-z15.s }, z8.s[0]\n"
+        ".inst 0xc1588203  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[0]\n"
+        "ble 25f\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa0404761  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa0404703  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404525  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc1588780  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[1]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046e7  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc1588401  // fmla za.s[x8, 1], { z0.s-z3.s }, z8.s[1]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc1588482  // fmla za.s[x8, 2], { z4.s-z7.s }, z8.s[1]\n"
+        ".inst 0xc1588603  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[1]\n"
+        "ble 25f\n"
+        ".inst 0xa04045fd  // ldnt1w { z28.s-z29.s }, pn9.b/Z, [x15]\n"
+        "subs x11, x11, #0x1\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa040473f  // ldnt1w { z30.s-z31.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa040476d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa040470f  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404521  // ldnt1w { z0.s-z1.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc1588b80  // fmla za.s[x8, 0], { z28.s-z31.s }, z8.s[2]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046e3  // ldnt1w { z2.s-z3.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc1588981  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[2]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc1588802  // fmla za.s[x8, 2], { z0.s-z3.s }, z8.s[2]\n"
+        ".inst 0xc1588a03  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[2]\n"
+        "ble 25f\n"
+        ".inst 0xa04045e5  // ldnt1w { z4.s-z5.s }, pn9.b/Z, [x15]\n"
+        "addvl x15, x15, #2\n"
+        ".inst 0xa0404727  // ldnt1w { z6.s-z7.s }, pn9.b/Z, [x25]\n"
+        "addvl x25, x25, #2\n"
+        ".inst 0xa040476d  // ldnt1w { z12.s-z13.s }, pn9.b/Z, [x27]\n"
+        "addvl x27, x27, #2\n"
+        ".inst 0xa040470f  // ldnt1w { z14.s-z15.s }, pn9.b/Z, [x24]\n"
+        "addvl x24, x24, #2\n"
+        ".inst 0xa0404535  // ldnt1w { z20.s-z21.s }, pn9.b/Z, [x9]\n"
+        ".inst 0xc1588c80  // fmla za.s[x8, 0], { z4.s-z7.s }, z8.s[3]\n"
+        "addvl x9, x9, #2\n"
+        ".inst 0xa04046f7  // ldnt1w { z22.s-z23.s }, pn9.b/Z, [x23]\n"
+        "addvl x23, x23, #2\n"
+        ".inst 0xa0404791  // ldnt1w { z16.s-z17.s }, pn9.b/Z, [x28]\n"
+        ".inst 0xc1588d81  // fmla za.s[x8, 1], { z12.s-z15.s }, z8.s[3]\n"
+        "addvl x28, x28, #2\n"
+        ".inst 0xa0404753  // ldnt1w { z18.s-z19.s }, pn9.b/Z, [x26]\n"
+        "addvl x26, x26, #2\n"
+        ".inst 0xc1588e82  // fmla za.s[x8, 2], { z20.s-z23.s }, z8.s[3]\n"
+        ".inst 0xc1588e03  // fmla za.s[x8, 3], { z16.s-z19.s }, z8.s[3]\n"
+        "25:"  // Width 4: Multiply loop: multiply skip
+        "tbz %x[flags], #1, 26f\n"
+        "add x21, %x[args_ptr], %[offset_min]\n"
+        "add x20, %x[args_ptr], %[offset_max]\n"
+        ".inst 0xc0060c04  // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
+        "ld1rw { z21.s }, p1/Z, [x21]\n"
+        ".inst 0xc0060c4c  // mova { z12.d-z15.d }, za.d[x8, #2]\n"
+        "ld1rw { z20.s }, p1/Z, [x20]\n"
+        ".inst 0xc0060c70  // mova { z16.d-z19.d }, za.d[x8, #3]\n"
+        ".inst 0xc1b4caa4  // fclamp { z4.s-z7.s }, z21.s, z20.s\n"
+        ".inst 0xc1b4caa0  // fclamp { z0.s-z3.s }, z21.s, z20.s\n"
+        ".inst 0xc1b4caac  // fclamp { z12.s-z15.s }, z21.s, z20.s\n"
+        ".inst 0xc1b4cab0  // fclamp { z16.s-z19.s }, z21.s, z20.s\n"
+        ".inst 0xa060c5a4  // st1w { z4.s-z7.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c5a0  // st1w { z0.s-z3.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+        ".inst 0xa062c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+        ".inst 0xa063c1b0  // st1w { z16.s-z19.s }, p8, [x13, #0xc, MUL VL]\n"
+        "addvl x13, x13, #16\n"
+        "b 27f\n"
+        "26:"  // Width 4: No activation
+        ".inst 0xc0060c0c  // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+        ".inst 0xc0060c20  // mova { z0.d-z3.d }, za.d[x8, #1]\n"
+        ".inst 0xc0060c50  // mova { z16.d-z19.d }, za.d[x8, #2]\n"
+        ".inst 0xc0060c64  // mova { z4.d-z7.d }, za.d[x8, #3]\n"
+        ".inst 0xa060c5ac  // st1w { z12.s-z15.s }, pn9.b, [x13]\n"
+        ".inst 0xa061c5a0  // st1w { z0.s-z3.s }, pn9.b, [x13, #0x4, MUL VL]\n"
+        ".inst 0xa062c5b0  // st1w { z16.s-z19.s }, pn9.b, [x13, #0x8, MUL VL]\n"
+        ".inst 0xa063c1a4  // st1w { z4.s-z7.s }, p8, [x13, #0xc, MUL VL]\n"
+        "addvl x13, x13, #16\n"
+        "27:"  // Width 4: Output done
+        "subs x14, x14, #0x4\n"
+        "mov x15, x22\n"
+        "sub %x[N], %x[N], x16, LSL #2\n"
+        "bgt 4b\n"
+        "28:"  // Exit
+        ".inst 0xd503467f  // SMSTOP\n"
+        "ptrue p8.b\n"
+        : [N] "+&r"(N)
+        : [A_ptr] "r"(A_ptr), [B_ptr] "r"(B_ptr), [K] "r"(K), [args_ptr] "r"(&ka), [flags] "r"(flags),
+          [offset_max] "I"(offsetof(KernelArgs, maxval)), [offset_min] "I"(offsetof(KernelArgs, minval)),
+          [output_ptr] "r"(output_ptr)
+        : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14",
+          "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25",
+          "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13",
+          "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28",
+          "z29", "z30", "z31");
+}
+
+#endif  // Architectural features check.
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h
new file mode 100644
index 00000000..62f0d159
--- /dev/null
+++ b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h
@@ -0,0 +1,116 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/// Micro-kernel dependencies
+///
+/// -# kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme to pack the RHS matrix.
+
+/// --------------------------------------------------
+
+/// Gets m step value.
+///
+/// The starting row index must be divisible by `m_step`.
+///
+/// @return The m step value.
+size_t kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void);
+
+/// Gets n step value.
+///
+/// The starting column index must be divisible by `n_step`.
+///
+/// @return The n step value.
+size_t kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void);
+
+/// Gets nr value.
+///
+/// This is the packing parameter which must be used to pack the RHS matrix.
+///
+/// @return The nr value.
+size_t kai_get_nr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void);
+
+/// Gets kr value.
+///
+/// This is the packing parameter which must be used to pack the RHS matrix.
+///
+/// @return The kr value.
+size_t kai_get_kr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void);
+
+/// Gets sr value.
+///
+/// This is the packing parameter which must be used to pack the RHS matrix.
+///
+/// @return The sr value.
+size_t kai_get_sr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(void);
+
+/// Gets the offset in bytes to the data element in the LHS matrix buffer.
+///
+/// @param[in] m_idx Row index.
+/// @param[in] lhs_stride Row stride in bytes.
+///
+/// @return The offset in bytes to the data element.
+size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m_idx, size_t lhs_stride);
+
+/// Gets the offset in bytes to the data element in the packed RHS matrix buffer.
+///
+/// @param[in] n_idx Column index in the unpacked RHS matrix.
+/// @param[in] k Number of rows in the unpacked RHS matrix.
+///
+/// @return The offset in bytes to the data element.
+size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t n_idx, size_t k);
+
+/// Gets the offset in bytes to the data element in the destination matrix buffer.
+///
+/// @param[in] m_idx Row index.
+/// @param[in] n_idx Column index.
+/// @param[in] dst_stride Row stride in bytes.
+///
+/// @return The offset in bytes to the data element.
+size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
+    size_t m_idx, size_t n_idx, size_t dst_stride);
+
+/// Gets the size in bytes of the destination matrix buffer.
+///
+/// @param[in] m Number of rows.
+/// @param[in] n Number of columns.
+///
+/// @return The size in bytes of the destination matrix buffer.
+size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(size_t m, size_t n);
+
+/// Runs the matrix multiplication microkernel followed by a clamp operation.
+///
+/// The pointer to each buffer (LHS, packed RHS and output) must be advanced by the offset
+/// calculated using the following functions:
+///
+///   * LHS: @ref kai_get_lhs_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.
+///   * Packed RHS: @ref kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.
+///   * Output: @ref kai_get_dst_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.
+///
+/// @param[in]  m Number of output rows to be computed. This must be 1.
+/// @param[in]  n Number of output columns to be computed.
+/// @param[in]  k Common dimension of the LHS and RHS operands.
+/// @param[in]  lhs LHS matrix buffer.
+/// @param[in]  lhs_stride Row stride in bytes of the LHS matrix. Currently, an unused parameter.
+/// @param[in]  rhs_packed Packed RHS matrix buffer.
+/// @param[out] dst Output matrix buffer.
+/// @param[in]  dst_stride_row Row stride in bytes of the output matrix. Currently, an unused parameter.
+/// @param[in]  dst_stride_col Column stride in bytes of the output matrix. Currently, an unused parameter.
+/// @param[in]  clamp_min Minimum value to clamp the final result.
+/// @param[in]  clamp_max Maximum value to clamp the final result.
+void kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla(
+    size_t m, size_t n, size_t k, const void* lhs, size_t lhs_stride, const void* rhs_packed, void* dst,
+    size_t dst_stride_row, size_t dst_stride_col, float clamp_min, float clamp_max);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp
index 3146a5b5..e95513cf 100644
--- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp
@@ -6,14 +6,20 @@
 
 #include <gtest/gtest.h>
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <limits>
+#include <memory>
 #include <sstream>
 #include <vector>
 
+#include "kai/kai_common.h"
+#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"
+#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p_interface.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla.h"
+#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32pb_f32_f32_16vlx1_sme.h"
 #include "test/common/cpu_info.hpp"
 #include "test/common/data_type.hpp"
@@ -24,22 +30,57 @@
 
 namespace kai::test {
 
-class kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla : public ::testing::TestWithParam<MatMulShape> {};
-
-TEST_P(kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla, EndToEnd) {
-    if (!cpu_has_sme2()) {
+namespace {
+const std::array<UkernelVariant<kai_matmul_clamp_f32_f32_f32p_ukernel>, 2> ukernel_variants = {
+    {{  // Index 0: the EndToEnd test's switch on variant_idx picks the RHS packing routine by this index.
+         {kai_get_m_step_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_n_step_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_nr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla, kai_get_kr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_sr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_lhs_offset_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_dst_offset_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_get_dst_size_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
+          kai_run_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla},
+         "matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla",
+         cpu_has_sme2,
+     },
+     {{kai_get_m_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,  // Index 1 in the RHS packing switch.
+       kai_get_n_step_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_nr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_kr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_sr_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_lhs_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_dst_offset_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_get_dst_size_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla,
+       kai_run_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla},
+      "matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla",
+      cpu_has_sme2}}};
+}  // namespace
+
+class MatMulTest_f32_f32_f32p : public ::testing::TestWithParam<MatMulTestParams> {};
+
+TEST_P(MatMulTest_f32_f32_f32p, EndToEnd)  // NOLINT(google-readability-avoid-underscore-in-googletest-name)
+{
+    const auto& [variant_idx, matmul_shape] = GetParam();
+    const auto& ukernel_variant = ukernel_variants.at(variant_idx);
+
+    if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
         GTEST_SKIP();
     }
 
-    const std::uint64_t seed = 0;
+    constexpr uint64_t seed = 0;
 
-    const auto& [m, n, k] = GetParam();
+    const size_t m = matmul_shape.m;
+    const size_t n = matmul_shape.n;
+    const size_t k = matmul_shape.k;
 
     GTEST_ASSERT_EQ(m, 1);
 
-    const auto nr = kai_get_nr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla();
-    const auto kr = kai_get_kr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla();
-    const auto sr = kai_get_sr_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla();
+    const auto nr = ukernel_variant.interface.get_nr();
+    const auto kr = ukernel_variant.interface.get_kr();
+    const auto sr = ukernel_variant.interface.get_sr();
 
     // Generates input data.
     const auto ref_lhs = fill_random<float>(m * k, seed + 0);
@@ -52,22 +93,40 @@ TEST_P(kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla, EndToEnd) {
         ref_bias.data(), nullptr, nullptr, DataType::FP32, DataType::FP32, m, n, k, false, false);
 
     // Run the RHS packing micro-kernel.
-    const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32pb_f32_f32_16vlx1_sme(n, k);
-    std::vector<float> imp_packed_rhs(imp_packed_rhs_size);
-    kai_run_rhs_pack_kxn_f32pb_f32_f32_16vlx1_sme(
-        1, n, k, nr, kr, sr, n * sizeof(float), ref_rhs.data(), ref_bias.data(), nullptr, imp_packed_rhs.data(), 0,
-        nullptr);
-
-    // Runs the GEMV micro-kernel.
-    const auto imp_dst_size = kai_get_dst_size_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla(m, n);
+    const auto rhs_stride = n * sizeof(float);
+
+    size_t imp_packed_rhs_size = 0;
+    std::unique_ptr<std::vector<float>> imp_packed_rhs;
+
+    switch (variant_idx) {
+        case 0:  // matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla
+            imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32pb_f32_f32_16vlx1_sme(n, k);
+            imp_packed_rhs = std::make_unique<std::vector<float>>(imp_packed_rhs_size);
+            kai_run_rhs_pack_kxn_f32pb_f32_f32_16vlx1_sme(
+                1, n, k, nr, kr, sr, rhs_stride, ref_rhs.data(), ref_bias.data(), nullptr, imp_packed_rhs->data(), 0,
+                nullptr);
+            break;
+        case 1:  // matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla
+            imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(n, k);
+            imp_packed_rhs = std::make_unique<std::vector<float>>(imp_packed_rhs_size);
+            kai_run_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme(
+                1, n, k, nr, kr, sr, rhs_stride, ref_rhs.data(), ref_bias.data(), nullptr, imp_packed_rhs->data(), 0,
+                nullptr);
+            break;
+        default:
+            KAI_ERROR("Unsupported micro-kernel");
+    }
+
+    // Run the MatMul micro-kernel.
+    const auto imp_dst_size = ukernel_variant.interface.get_dst_size(m, n);
     ASSERT_EQ(imp_dst_size, ref_dst.size());
 
     std::vector<uint8_t> imp_dst(imp_dst_size);
-    kai_run_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla(
-        m, n, k, ref_lhs.data(), 1, imp_packed_rhs.data(), reinterpret_cast<float*>(imp_dst.data()), 1, 1,
+    ukernel_variant.interface.run_matmul(
+        m, n, k, ref_lhs.data(), 1, imp_packed_rhs->data(), reinterpret_cast<float*>(imp_dst.data()), 1, 1,
         std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max());
 
-    // Compares the output of the micro-kernels against the output of the reference implementation.
+    // Compare the output of the micro-kernels against the output of the reference implementation.
     for (size_t y = 0; y < m; ++y) {
         for (size_t x = 0; x < n; ++x) {
             const auto imp_value = read_array<float>(imp_dst.data(), (y * n) + x);
@@ -82,14 +141,25 @@ TEST_P(kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla, EndToEnd) {
 }
 
 INSTANTIATE_TEST_SUITE_P(
-    MatMul, kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla,
-    testing::Values(
-        MatMulShape{1, 1, 1}, MatMulShape{1, 16, 1}, MatMulShape{1, 32, 64}, MatMulShape{1, 7, 74},
-        MatMulShape{1, 800, 64}, MatMulShape{1, 512, 130}),
-    [](const testing::TestParamInfo<kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla::ParamType>& info) {
+    MatMul, MatMulTest_f32_f32_f32p,
+    testing::Combine(
+        testing::Range<size_t>(0, ukernel_variants.size()),
+        testing::Values(
+            MatMulShape{1, 1, 1},     //
+            MatMulShape{1, 16, 1},    //
+            MatMulShape{1, 32, 64},   //
+            MatMulShape{1, 7, 74},    //
+            MatMulShape{1, 800, 64},  //
+            MatMulShape{1, 512, 130}  //
+            )),
+    [](const testing::TestParamInfo<MatMulTest_f32_f32_f32p::ParamType>& info) {
+        const size_t variant_idx = std::get<0>(info.param);  // Same type as testing::Range<size_t>; no narrowing.
+        const MatMulShape matmul_shape = std::get<1>(info.param);
+        // Test name format: <variant name>_m_<m>_n_<n>_k_<k>.
         std::stringstream sstream;
-        sstream << "kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla_"
-                << "_m_" << info.param.m << "_n_" << info.param.n << "_k_" << info.param.k;
+        sstream << ukernel_variants[variant_idx].name << "_m_" << matmul_shape.m << "_n_" << matmul_shape.n << "_k_"
+                << matmul_shape.k;
+
         return sstream.str();
     });
 
-- 
GitLab


From 77ab79a11ee7e952bce139864cbd2a0716102378 Mon Sep 17 00:00:00 2001
From: Jakub Sujak <jakub.sujak@arm.com>
Date: Fri, 25 Oct 2024 16:39:49 +0100
Subject: [PATCH 2/2] Add interface file as a Bazel dependency

Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
---
 kai/ukernels/matmul/BUILD.bazel | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel
index c2c8283a..bb047b41 100644
--- a/kai/ukernels/matmul/BUILD.bazel
+++ b/kai/ukernels/matmul/BUILD.bazel
@@ -65,11 +65,19 @@ kai_c_library(
     ],
 )
 
+cc_library(
+    name = "clamp_f32_f32_f32p_interface",
+    hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p_interface.h"],
+)
+
 kai_c_library(
     name = "clamp_f32_f32_f32p",
     srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c"],
     hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"],
     cpu_uarch = kai_cpu_neon(),
+    deps = [
+        ":clamp_f32_f32_f32p_interface",
+    ],
 )
 
 kai_c_library(
@@ -84,6 +92,9 @@ kai_c_library(
     srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla.c"],
     hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32pb_1x16vl_sme2_mla.h"],
     cpu_uarch = kai_cpu_sme(),
+    deps = [
+        ":clamp_f32_f32_f32p_interface",
+    ],
 )
 
 kai_c_library(
@@ -91,6 +102,9 @@ kai_c_library(
     srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.c"],
     hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla.h"],
     cpu_uarch = kai_cpu_sme(),
+    deps = [
+        ":clamp_f32_f32_f32p_interface",
+    ],
 )
 
 cc_library(
-- 
GitLab