From 1a087907afc500c4ae7eef8f2ff0a0029d802625 Mon Sep 17 00:00:00 2001
From: Jakub Sujak <jakub.sujak@arm.com>
Date: Sun, 22 Sep 2024 23:31:35 +0100
Subject: [PATCH] Revert "Add FP32 GEMV micro kernel"

This reverts commit f4f59599.

The existing FP32 GEMM micro-kernel
(matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla) already has a
dedicated path for M=1 (a "GEMV" operation), so a separate FP32 GEMV
micro-kernel is not needed.

Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
---
 CHANGELOG.md                                  |   2 +-
 CMakeLists.txt                                |   1 -
 kai/ukernels/matmul/BUILD.bazel               |  10 +-
 ...mp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c | 315 ------------------
 ...mp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h | 124 -------
 test/tests/matmul_test.cpp                    |  51 ---
 6 files changed, 3 insertions(+), 500 deletions(-)
 delete mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c
 delete mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5428c542..4831fd13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo
 
 ## v0.3.0 - Upcoming Release
 
-- Advanced SIMD FP32 GEMM and GEMV micro kernels
+- Advanced SIMD FP32 GEMM micro-kernel.
 - Micro-kernels to compute the matrix multiplication of dynamically quantized asymmetric signed 8-bit integer with per-row quantization (QAI8DX) LHS and quantized symmetric 4-bit signed integer with per-block quantization (QSI4C32) RHS. The destination matrix data type is single-precision floating-point (F32). The micro-kernels have been optimized using the Arm® CPU feature FEAT_I8MM for the matrix-by-matrix cases and the FEAT_DotProd for the vector-by-matrix cases.
 - RHS matrix packing micro-kernels to pack the RHS matrix holding the QSI4C32 values.
 - Unit test and example for integer micro-kernels.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 748a9ae3..39fbabd5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,7 +92,6 @@ set(KLEIDIAI_FILES_NEON_FP16
 set(KLEIDIAI_FILES_NEON
     kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c
     kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c
-    kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c
 )
 
 set(KLEIDIAI_FILES_NEON_DOTPROD
diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel
index 7dbec375..8caf3e81 100644
--- a/kai/ukernels/matmul/BUILD.bazel
+++ b/kai/ukernels/matmul/BUILD.bazel
@@ -44,14 +44,8 @@ kai_c_library(
 
 kai_c_library(
     name = "clamp_f32_f32_f32p",
-    srcs = [
-        "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c",
-        "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c",
-    ],
-    hdrs = [
-        "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h",
-        "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h",
-    ],
+    srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c"],
+    hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"],
     cpu_uarch = kai_cpu_neon(),
 )
 
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c
deleted file mode 100644
index 73a71c39..00000000
--- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c
+++ /dev/null
@@ -1,315 +0,0 @@
-//
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#if !defined(__aarch64__)
-#error This file must be compiled for AArch64.
-#else  // Architectural features check.
-
-#include <math.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "kai/kai_common.h"
-
-static const size_t kai_mr = 1;
-static const size_t kai_nr = 8;
-static const size_t kai_kr = 1;
-static const size_t kai_sr = 1;
-
-size_t kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) {
-    return kai_mr;
-}
-
-size_t kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) {
-    return kai_nr;
-}
-
-size_t kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) {
-    return kai_nr;
-}
-
-size_t kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) {
-    return kai_kr;
-}
-
-size_t kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) {
-    return kai_sr;
-}
-
-size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t stride) {
-    KAI_ASSUME(m_idx % kai_mr == 0);
-
-    return m_idx * stride;
-}
-
-size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t n_idx, size_t k) {
-    KAI_ASSUME(n_idx % kai_nr == 0);
-
-    return n_idx / kai_nr * (kai_nr * sizeof(float) + kai_nr * k * sizeof(float));
-}
-
-size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(
-    size_t m_idx, size_t n_idx, size_t stride) {
-    KAI_ASSUME(m_idx % kai_mr == 0);
-    KAI_ASSUME(n_idx % kai_nr == 0);
-
-    return m_idx * stride + n_idx * sizeof(float);
-}
-
-size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m, size_t n) {
-    return m * n * sizeof(float);
-}
-
-void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(
-    size_t m, size_t n, size_t k,                             //
-    const void* lhs, size_t lhs_stride,                       //
-    const void* rhs_packed,                                   //
-    void* dst, size_t dst_stride_row, size_t dst_stride_col,  //
-    float clamp_min, float clamp_max) {
-    KAI_ASSERT(dst_stride_col == sizeof(float));
-
-    typedef struct {
-        float maxval;
-        float minval;
-        unsigned int num_strings;
-        const unsigned int* string_lengths;
-        size_t N;
-        const void* B_ptr;
-        size_t output_offset;
-        size_t input_initial_col;
-        size_t input_offset;
-        void* output_ptr;
-        const void* bias;
-    } KernelArgs;
-
-    KernelArgs ka;
-
-    unsigned long flags = 0;
-
-    unsigned int string_length = k;
-    ka.num_strings = 1;
-    ka.string_lengths = &string_length;
-    ka.N = n;
-    ka.B_ptr = rhs_packed;
-    ka.bias = NULL;
-
-    // Direct input.
-    const void* input_ptr = lhs;
-    ka.input_offset = lhs_stride / sizeof(float);
-    ka.input_initial_col = 0;
-
-    // Direct output.
-    ka.output_ptr = dst;
-    ka.output_offset = dst_stride_row / sizeof(float);
-
-    // Clamping output.
-    flags |= 0x2;
-    ka.maxval = clamp_max;
-    ka.minval = clamp_min;
-
-    __asm__ __volatile__(
-        "1:"  // Row loop
-        "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n"
-        "ldr x26, [%x[args_ptr], %[offsetof_output_ptr]]\n"
-        "mov x20, #0x4\n"
-        "ldr x25, [%x[args_ptr], %[offsetof_N]]\n"
-        "ldr x24, [%x[args_ptr], %[offsetof_B_ptr]]\n"
-        "madd x20, x21, x20, x26\n"
-        "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n"
-        "2:"  // Height 1: Column loop
-        "cbz x24, 3f\n"
-        "ldr q30, [x24, #0x0]\n"
-        "ldr q31, [x24, #0x10]\n"
-        "add x24, x24, #0x20\n"
-        "b 10f\n"
-        "3:"  // Height 1: no bias
-        "tbz %x[flags], #0, 9f\n"
-        "cmp x25, #0x8\n"
-        "bge 8f\n"
-        "tbz x25, #2, 5f\n"
-        "ld1 { v30.4s }, [x26], #0x10\n"
-        "tbz x25, #1, 4f\n"
-        "ldr d31, [x26], #0x8\n"
-        "mov x20, #0x18\n"
-        "tbz x25, #0, 7f\n"
-        "ld1 { v31.s }[2], [x26]\n"
-        "b 7f\n"
-        "4:"  // Height 1: Partial accumulate: partial_1_4
-        "mov x20, #0x10\n"
-        "tbz x25, #0, 7f\n"
-        "ldr s31, [x26, #0x0]\n"
-        "b 7f\n"
-        "5:"  // Height 1: Partial accumulate: partial_2_0
-        "tbz x25, #1, 6f\n"
-        "ldr d30, [x26], #0x8\n"
-        "mov x20, #0x8\n"
-        "tbz x25, #0, 7f\n"
-        "ld1 { v30.s }[2], [x26]\n"
-        "b 7f\n"
-        "6:"  // Height 1: Partial accumulate: partial_1_0
-        "ldr s30, [x26, #0x0]\n"
-        "mov x20, #0x0\n"
-        "7:"  // Height 1: Partial accumulate: Done
-        "sub x26, x26, x20\n"
-        "b 10f\n"
-        "8:"  // Height 1: full accumulate
-        "ldr q30, [x26, #0x0]\n"
-        "ldr q31, [x26, #0x10]\n"
-        "b 10f\n"
-        "9:"  // Height 1: no accumulate
-        "movi v30.16b, #0x0\n"
-        "movi v31.16b, #0x0\n"
-        "10:"  // Height 1: setup done
-        "mov x23, #0x0\n"
-        "11:"  // Height 1: String loop
-        "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
-        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
-        "ldr w22, [x20, x23, LSL #0x2]\n"
-        "tbz %x[flags], #3, 12f\n"
-        "ldr x20, [%x[input_ptr], x23, LSL #0x3]\n"
-        "add x20, x20, x21, LSL #3\n"
-        "ldr x21, [x20, #0x0]\n"
-        "cbnz x23, 13f\n"
-        "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
-        "add x21, x21, x20, LSL #2\n"
-        "b 13f\n"
-        "12:"  // Height 1: setup direct input
-        "mov x21, %x[input_ptr]\n"
-        "13:"  // Height 1: input setup done
-        "cmp x22, #0x4\n"
-        "blt 16f\n"
-        "ldr q0, [x21, #0x0]\n"
-        "ldr q1, [x24, #0x0]\n"
-        "cmp x22, #0x8\n"
-        "ldr q2, [x24, #0x10]\n"
-        "ldr q3, [x24, #0x20]\n"
-        "ldr q4, [x24, #0x30]\n"
-        "ldr q5, [x24, #0x40]\n"
-        "ldr q6, [x24, #0x50]\n"
-        "ldr q7, [x24, #0x60]\n"
-        "ldr q8, [x24, #0x70]\n"
-        "blt 15f\n"
-        "14:"  // Height 1: Multiply loop: Main loop head
-        "fmla v30.4s, v1.4s, v0.s[0]\n"
-        "fmla v31.4s, v2.4s, v0.s[0]\n"
-        "sub x22, x22, #0x4\n"
-        "add x21, x21, #0x10\n"
-        "cmp x22, #0x8\n"
-        "add x24, x24, #0x80\n"
-        "prfm pldl1keep, [x21, #0x80]\n"
-        "ldr q1, [x24, #0x0]\n"
-        "ldr q2, [x24, #0x10]\n"
-        "fmla v30.4s, v3.4s, v0.s[1]\n"
-        "ldr q3, [x24, #0x20]\n"
-        "fmla v31.4s, v4.4s, v0.s[1]\n"
-        "ldr q4, [x24, #0x30]\n"
-        "fmla v30.4s, v5.4s, v0.s[2]\n"
-        "ldr q5, [x24, #0x40]\n"
-        "fmla v31.4s, v6.4s, v0.s[2]\n"
-        "ldr q6, [x24, #0x50]\n"
-        "fmla v30.4s, v7.4s, v0.s[3]\n"
-        "ldr q7, [x24, #0x60]\n"
-        "fmla v31.4s, v8.4s, v0.s[3]\n"
-        "ldr q0, [x21, #0x0]\n"
-        "ldr q8, [x24, #0x70]\n"
-        "bge 14b\n"
-        "15:"  // Height 1: Multiply loop: Single iteration only
-        "fmla v30.4s, v1.4s, v0.s[0]\n"
-        "fmla v31.4s, v2.4s, v0.s[0]\n"
-        "add x21, x21, #0x10\n"
-        "sub x22, x22, #0x4\n"
-        "add x24, x24, #0x80\n"
-        "prfm pldl1keep, [x21, #0x80]\n"
-        "fmla v30.4s, v3.4s, v0.s[1]\n"
-        "fmla v31.4s, v4.4s, v0.s[1]\n"
-        "fmla v30.4s, v5.4s, v0.s[2]\n"
-        "fmla v31.4s, v6.4s, v0.s[2]\n"
-        "fmla v30.4s, v7.4s, v0.s[3]\n"
-        "fmla v31.4s, v8.4s, v0.s[3]\n"
-        "16:"  // Height 1: Multiply loop: Main loop skip
-        "cbz x22, 18f\n"
-        "17:"  // Height 1: Multiply loop: Odd block loop
-        "ldr s18, [x21], #0x4\n"
-        "ldr q17, [x24, #0x0]\n"
-        "sub x22, x22, #0x1\n"
-        "ldr q16, [x24, #0x10]\n"
-        "add x24, x24, #0x20\n"
-        "fmla v30.4s, v17.4s, v18.s[0]\n"
-        "fmla v31.4s, v16.4s, v18.s[0]\n"
-        "cbnz x22, 17b\n"
-        "18:"  // Height 1: Multiply loop: No odd multiplies
-        "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
-        "add x23, x23, #0x1\n"
-        "cmp x23, x20\n"
-        "bne 11b\n"
-        "prfm pstl1keep, [x26, #0x0]\n"
-        "tbz %x[flags], #1, 19f\n"
-        "add x21, %x[args_ptr], %[offset_max]\n"
-        "add x20, %x[args_ptr], %[offset_min]\n"
-        "ld1r { v17.4s }, [x21]\n"
-        "ld1r { v16.4s }, [x20]\n"
-        "fmin v30.4s, v30.4s, v17.4s\n"
-        "fmin v31.4s, v31.4s, v17.4s\n"
-        "fmax v30.4s, v30.4s, v16.4s\n"
-        "fmax v31.4s, v31.4s, v16.4s\n"
-        "19:"  // Height 1: No activation
-        "cmp x25, #0x8\n"
-        "bge 24f\n"
-        "tbz x25, #2, 21f\n"
-        "st1 { v30.4s }, [x26], #0x10\n"
-        "tbz x25, #1, 20f\n"
-        "str d31, [x26], #0x8\n"
-        "tbz x25, #0, 23f\n"
-        "st1 { v31.s }[2], [x26]\n"
-        "b 23f\n"
-        "20:"  // Height 1: Partial direct writeback: partial_1_4
-        "tbz x25, #0, 23f\n"
-        "str s31, [x26, #0x0]\n"
-        "b 23f\n"
-        "21:"  // Height 1: Partial direct writeback: partial_2_0
-        "tbz x25, #1, 22f\n"
-        "str d30, [x26], #0x8\n"
-        "tbz x25, #0, 23f\n"
-        "st1 { v30.s }[2], [x26]\n"
-        "b 23f\n"
-        "22:"  // Height 1: Partial direct writeback: partial_1_0
-        "str s30, [x26, #0x0]\n"
-        "23:"  // Height 1: Partial direct writeback: Done
-        "b 25f\n"
-        "24:"  // Height 1: Full writeback
-        "str q30, [x26, #0x0]\n"
-        "str q31, [x26, #0x10]\n"
-        "add x26, x26, #0x20\n"
-        "25:"  // Height 1: Writeback done
-        "subs x25, x25, #0x8\n"
-        "bgt 2b\n"
-        "subs %x[m], %x[m], #0x1\n"
-        "beq 27f\n"
-        "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
-        "tbz %x[flags], #3, 26f\n"
-        "add x21, x21, #0x1\n"
-        "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n"
-        "b 1b\n"
-        "26:"  // Update direct input
-        "mov x20, #0x4\n"
-        "madd %x[input_ptr], x20, x21, %x[input_ptr]\n"
-        "b 1b\n"
-        "27:"  // Exit
-        : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m)
-        : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)),
-          [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)),
-          [offsetof_N] "I"(offsetof(KernelArgs, N)),
-          [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)),
-          [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)),
-          [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)),
-          [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)),
-          [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)),
-          [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths))
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v30", "v31",
-          "x20", "x21", "x22", "x23", "x24", "x25", "x26");
-}
-
-#endif  // Architectural features check.
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h
deleted file mode 100644
index acdd292d..00000000
--- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h
+++ /dev/null
@@ -1,124 +0,0 @@
-//
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#if !defined(__aarch64__)
-#error This file must be compiled for AArch64.
-#else  // Architectural features check.
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-/// Micro-kernel dependencies
-///
-/// -# kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon to pack the RHS matrix
-
-/// --------------------------------------------------
-
-/// Gets m step value.
-///
-/// The starting row index must be divisible by `m_step`.
-///
-/// @return The m step value.
-size_t kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void);
-
-/// Gets n step value.
-///
-/// The starting column index must be divisible by `n_step`.
-///
-/// @return The n step value.
-size_t kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void);
-
-/// Gets nr value.
-///
-/// This is the packing parameter which must be used to pack the RHS matrix.
-///
-/// @return The nr value.
-size_t kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void);
-
-/// Gets kr value.
-///
-/// This is the packing parameter which must be used to pack the RHS matrix.
-///
-/// @return The kr value.
-size_t kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void);
-
-/// Gets sr value.
-///
-/// This is the packing parameter which must be used to pack the RHS matrix.
-///
-/// @return The sr value.
-size_t kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void);
-
-/// Gets the offset in bytes to the data element in the LHS matrix buffer.
-///
-/// @param[in] m_idx Row index.
-/// @param[in] stride Row stride in bytes.
-///
-/// @return The offset in bytes to the data element.
-size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t stride);
-
-/// Gets the offset in bytes to the data element in the packed RHS matrix buffer.
-///
-/// @param[in] n_idx Row index.
-/// @param[in] k Number of columns.
-///
-/// @return The offset in bytes to the data element.
-size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t n_idx, size_t k);
-
-/// Gets the offset in bytes to the data element in the destination matrix buffer.
-///
-/// @param[in] m_idx Row index.
-/// @param[in] n_idx Column index.
-/// @param[in] stride Row stride in bytes.
-///
-/// @return The offset in bytes to the data element.
-size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t n_idx, size_t stride);
-
-/// Gets the size in bytes of the destination matrix buffer.
-///
-/// @param[in] m Number of rows.
-/// @param[in] n Number of columns.
-///
-/// @return The size in bytes of the destination matrix buffer.
-size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m, size_t n);
-
-/// Runs the matrix multiplication microkernel followed by a clamp operation.
-///
-/// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset
-/// calculated using the following functions:
-///
-///   * LHS: @ref kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.
-///   * Packed RHS: @ref kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.
-///   * Output: @ref kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.
-///
-/// @param[in]  m Number of output rows to be computed.
-/// @param[in]  n Number of output columns to be computed.
-/// @param[in]  k Common dimension of the LHS and RHS operand.
-/// @param[in]  lhs LHS matrix buffer.
-/// @param[in]  lhs_stride Row stride in bytes of the LHS matrix.
-/// @param[in]  rhs_packed Packed RHS buffer.
-/// @param[out] dst Output matrix buffer.
-/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
-/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
-/// @param[in]  clamp_min Minimum value to clamp the final result.
-/// @param[in]  clamp_max Maximum value to clamp the final result.
-void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(
-    size_t m, size_t n, size_t k,                             //
-    const void* lhs, size_t lhs_stride,                       //
-    const void* rhs_packed,                                   //
-    void* dst, size_t dst_stride_row, size_t dst_stride_col,  //
-    float clamp_min, float clamp_max);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-#endif  // Architectural features check.
diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp
index 3e68bac0..1c7e15ba 100644
--- a/test/tests/matmul_test.cpp
+++ b/test/tests/matmul_test.cpp
@@ -45,10 +45,8 @@
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h"
 
 // matmul_clamp_f32_f32_f32p
-#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h"
-
 namespace kai::test {
 
 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
@@ -389,55 +387,6 @@ static const std::array matmul_methods = {
         .fn_matmul_f32_f32p_f32p = nullptr,
     },
 
-    MatMulMethod{
-        .name = "matmul_nt_nt_fp32_fp32_fp32_1x8_neon_mla",
-
-        .m0 = 1,
-        .n0 = 8,
-
-        .lhs_transposed = false,
-        .rhs_transposed = false,
-
-        .is_sme2 = false,
-
-        .dst_format = DataFormat(DataType::FP32),
-        .lhs_format = DataFormat(DataType::FP32),
-        .packed_lhs_format = DataFormat(DataType::UNKNOWN),
-        .rhs_format = DataFormat(DataType::FP32),
-        .packed_rhs_format = DataFormat(
-            DataType::FP32, 8, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP32, DataType::UNKNOWN, 8, 1),
-        .bias_format = DataFormat(DataType::FP32),
-
-        .fn_get_mr = nullptr,
-        .fn_get_nr = kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_get_kr = kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_get_sr = kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-
-        .fn_get_main_m_step = kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_get_pack_rhs_n_step = kai_get_n_step_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-        .fn_get_main_n_step = kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-
-        .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_get_packed_lhs_size = nullptr,
-        .fn_get_packed_lhs_offset = nullptr,
-        .fn_pack_lhs = nullptr,
-
-        .fn_get_rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-        .fn_get_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-        .fn_get_pack_rhs_packed_rhs_offset = kai_get_rhs_packed_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-        .fn_get_main_packed_rhs_offset = kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_pack_rhs = kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-
-        .fn_get_bias_offset = kai_get_bias_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon,
-
-        .fn_get_dst_offset = kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_get_dst_size = kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-
-        .fn_matmul_f16_f16_f16p = nullptr,
-        .fn_matmul_f32_f32_f32p = kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla,
-        .fn_matmul_f32_f32p_f32p = nullptr,
-    },
-
     MatMulMethod{
         .name = "matmul_nt_nt_fp16_fp16_fp16_6x16_neon_mla",
 
-- 
GitLab