From 1a087907afc500c4ae7eef8f2ff0a0029d802625 Mon Sep 17 00:00:00 2001 From: Jakub Sujak Date: Sun, 22 Sep 2024 23:31:35 +0100 Subject: [PATCH] Revert "Add FP32 GEMV micro kernel" This reverts commit f4f59599 The existing FP32 GEMM micro-kernel (matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla) has a dedicated path for M=1 (a "GEMV" operation). Signed-off-by: Jakub Sujak --- CHANGELOG.md | 2 +- CMakeLists.txt | 1 - kai/ukernels/matmul/BUILD.bazel | 10 +- ...mp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c | 315 ------------------ ...mp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h | 124 ------- test/tests/matmul_test.cpp | 51 --- 6 files changed, 3 insertions(+), 500 deletions(-) delete mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c delete mode 100644 kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 5428c542..4831fd13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo ## v0.3.0 - Upcoming Release -- Advanced SIMD FP32 GEMM and GEMV micro kernels +- Advanced SIMD FP32 GEMM micro-kernel. - Micro-kernels to compute the matrix multiplication of dynamically quantized asymmetric signed 8-bit integer with per-row quantization (QAI8DX) LHS and quantized symmetric 4-bit signed integer with per-block quantization (QSI4C32) RHS. The destination matrix data type is single-precision floating-point (F32). The micro-kernels have been optimized using the Arm® CPU feature FEAT_I8MM for the matrix-by-matrix cases and the FEAT_DotProd for the vector-by-matrix cases. - RHS matrix packing micro-kernels to pack the RHS matrix holding the QSI4C32 values. - Unit test and example for integer micro-kernels. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 748a9ae3..39fbabd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,7 +92,6 @@ set(KLEIDIAI_FILES_NEON_FP16 set(KLEIDIAI_FILES_NEON kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c - kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c ) set(KLEIDIAI_FILES_NEON_DOTPROD diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 7dbec375..8caf3e81 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -44,14 +44,8 @@ kai_c_library( kai_c_library( name = "clamp_f32_f32_f32p", - srcs = [ - "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c", - "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c", - ], - hdrs = [ - "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h", - "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h", - ], + srcs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c"], + hdrs = ["matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"], cpu_uarch = kai_cpu_neon(), ) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c deleted file mode 100644 index 73a71c39..00000000 --- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.c +++ /dev/null @@ -1,315 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#if !defined(__aarch64__) -#error This file must be compiled for AArch64. 
-#else // Architectural features check. - -#include -#include -#include - -#include "kai/kai_common.h" - -static const size_t kai_mr = 1; -static const size_t kai_nr = 8; -static const size_t kai_kr = 1; -static const size_t kai_sr = 1; - -size_t kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) { - return kai_mr; -} - -size_t kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) { - return kai_nr; -} - -size_t kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) { - return kai_nr; -} - -size_t kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) { - return kai_kr; -} - -size_t kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void) { - return kai_sr; -} - -size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t stride) { - KAI_ASSUME(m_idx % kai_mr == 0); - - return m_idx * stride; -} - -size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t n_idx, size_t k) { - KAI_ASSUME(n_idx % kai_nr == 0); - - return n_idx / kai_nr * (kai_nr * sizeof(float) + kai_nr * k * sizeof(float)); -} - -size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla( - size_t m_idx, size_t n_idx, size_t stride) { - KAI_ASSUME(m_idx % kai_mr == 0); - KAI_ASSUME(n_idx % kai_nr == 0); - - return m_idx * stride + n_idx * sizeof(float); -} - -size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m, size_t n) { - return m * n * sizeof(float); -} - -void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla( - size_t m, size_t n, size_t k, // - const void* lhs, size_t lhs_stride, // - const void* rhs_packed, // - void* dst, size_t dst_stride_row, size_t dst_stride_col, // - float clamp_min, float clamp_max) { - KAI_ASSERT(dst_stride_col == sizeof(float)); - - typedef struct { - float maxval; - float minval; - unsigned int num_strings; - const unsigned int* string_lengths; - size_t N; - 
const void* B_ptr; - size_t output_offset; - size_t input_initial_col; - size_t input_offset; - void* output_ptr; - const void* bias; - } KernelArgs; - - KernelArgs ka; - - unsigned long flags = 0; - - unsigned int string_length = k; - ka.num_strings = 1; - ka.string_lengths = &string_length; - ka.N = n; - ka.B_ptr = rhs_packed; - ka.bias = NULL; - - // Direct input. - const void* input_ptr = lhs; - ka.input_offset = lhs_stride / sizeof(float); - ka.input_initial_col = 0; - - // Direct output. - ka.output_ptr = dst; - ka.output_offset = dst_stride_row / sizeof(float); - - // Clamping output. - flags |= 0x2; - ka.maxval = clamp_max; - ka.minval = clamp_min; - - __asm__ __volatile__( - "1:" // Row loop - "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ldr x26, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x20, #0x4\n" - "ldr x25, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x24, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x26\n" - "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "2:" // Height 1: Column loop - "cbz x24, 3f\n" - "ldr q30, [x24, #0x0]\n" - "ldr q31, [x24, #0x10]\n" - "add x24, x24, #0x20\n" - "b 10f\n" - "3:" // Height 1: no bias - "tbz %x[flags], #0, 9f\n" - "cmp x25, #0x8\n" - "bge 8f\n" - "tbz x25, #2, 5f\n" - "ld1 { v30.4s }, [x26], #0x10\n" - "tbz x25, #1, 4f\n" - "ldr d31, [x26], #0x8\n" - "mov x20, #0x18\n" - "tbz x25, #0, 7f\n" - "ld1 { v31.s }[2], [x26]\n" - "b 7f\n" - "4:" // Height 1: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x25, #0, 7f\n" - "ldr s31, [x26, #0x0]\n" - "b 7f\n" - "5:" // Height 1: Partial accumulate: partial_2_0 - "tbz x25, #1, 6f\n" - "ldr d30, [x26], #0x8\n" - "mov x20, #0x8\n" - "tbz x25, #0, 7f\n" - "ld1 { v30.s }[2], [x26]\n" - "b 7f\n" - "6:" // Height 1: Partial accumulate: partial_1_0 - "ldr s30, [x26, #0x0]\n" - "mov x20, #0x0\n" - "7:" // Height 1: Partial accumulate: Done - "sub x26, x26, x20\n" - "b 10f\n" - "8:" // Height 1: full accumulate - "ldr q30, [x26, 
#0x0]\n" - "ldr q31, [x26, #0x10]\n" - "b 10f\n" - "9:" // Height 1: no accumulate - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "10:" // Height 1: setup done - "mov x23, #0x0\n" - "11:" // Height 1: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w22, [x20, x23, LSL #0x2]\n" - "tbz %x[flags], #3, 12f\n" - "ldr x20, [%x[input_ptr], x23, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x21, [x20, #0x0]\n" - "cbnz x23, 13f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x21, x21, x20, LSL #2\n" - "b 13f\n" - "12:" // Height 1: setup direct input - "mov x21, %x[input_ptr]\n" - "13:" // Height 1: input setup done - "cmp x22, #0x4\n" - "blt 16f\n" - "ldr q0, [x21, #0x0]\n" - "ldr q1, [x24, #0x0]\n" - "cmp x22, #0x8\n" - "ldr q2, [x24, #0x10]\n" - "ldr q3, [x24, #0x20]\n" - "ldr q4, [x24, #0x30]\n" - "ldr q5, [x24, #0x40]\n" - "ldr q6, [x24, #0x50]\n" - "ldr q7, [x24, #0x60]\n" - "ldr q8, [x24, #0x70]\n" - "blt 15f\n" - "14:" // Height 1: Multiply loop: Main loop head - "fmla v30.4s, v1.4s, v0.s[0]\n" - "fmla v31.4s, v2.4s, v0.s[0]\n" - "sub x22, x22, #0x4\n" - "add x21, x21, #0x10\n" - "cmp x22, #0x8\n" - "add x24, x24, #0x80\n" - "prfm pldl1keep, [x21, #0x80]\n" - "ldr q1, [x24, #0x0]\n" - "ldr q2, [x24, #0x10]\n" - "fmla v30.4s, v3.4s, v0.s[1]\n" - "ldr q3, [x24, #0x20]\n" - "fmla v31.4s, v4.4s, v0.s[1]\n" - "ldr q4, [x24, #0x30]\n" - "fmla v30.4s, v5.4s, v0.s[2]\n" - "ldr q5, [x24, #0x40]\n" - "fmla v31.4s, v6.4s, v0.s[2]\n" - "ldr q6, [x24, #0x50]\n" - "fmla v30.4s, v7.4s, v0.s[3]\n" - "ldr q7, [x24, #0x60]\n" - "fmla v31.4s, v8.4s, v0.s[3]\n" - "ldr q0, [x21, #0x0]\n" - "ldr q8, [x24, #0x70]\n" - "bge 14b\n" - "15:" // Height 1: Multiply loop: Single iteration only - "fmla v30.4s, v1.4s, v0.s[0]\n" - "fmla v31.4s, v2.4s, v0.s[0]\n" - "add x21, x21, #0x10\n" - "sub x22, x22, #0x4\n" - "add x24, x24, #0x80\n" - "prfm pldl1keep, [x21, #0x80]\n" - "fmla 
v30.4s, v3.4s, v0.s[1]\n" - "fmla v31.4s, v4.4s, v0.s[1]\n" - "fmla v30.4s, v5.4s, v0.s[2]\n" - "fmla v31.4s, v6.4s, v0.s[2]\n" - "fmla v30.4s, v7.4s, v0.s[3]\n" - "fmla v31.4s, v8.4s, v0.s[3]\n" - "16:" // Height 1: Multiply loop: Main loop skip - "cbz x22, 18f\n" - "17:" // Height 1: Multiply loop: Odd block loop - "ldr s18, [x21], #0x4\n" - "ldr q17, [x24, #0x0]\n" - "sub x22, x22, #0x1\n" - "ldr q16, [x24, #0x10]\n" - "add x24, x24, #0x20\n" - "fmla v30.4s, v17.4s, v18.s[0]\n" - "fmla v31.4s, v16.4s, v18.s[0]\n" - "cbnz x22, 17b\n" - "18:" // Height 1: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x23, x23, #0x1\n" - "cmp x23, x20\n" - "bne 11b\n" - "prfm pstl1keep, [x26, #0x0]\n" - "tbz %x[flags], #1, 19f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v30.4s, v30.4s, v17.4s\n" - "fmin v31.4s, v31.4s, v17.4s\n" - "fmax v30.4s, v30.4s, v16.4s\n" - "fmax v31.4s, v31.4s, v16.4s\n" - "19:" // Height 1: No activation - "cmp x25, #0x8\n" - "bge 24f\n" - "tbz x25, #2, 21f\n" - "st1 { v30.4s }, [x26], #0x10\n" - "tbz x25, #1, 20f\n" - "str d31, [x26], #0x8\n" - "tbz x25, #0, 23f\n" - "st1 { v31.s }[2], [x26]\n" - "b 23f\n" - "20:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x25, #0, 23f\n" - "str s31, [x26, #0x0]\n" - "b 23f\n" - "21:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x25, #1, 22f\n" - "str d30, [x26], #0x8\n" - "tbz x25, #0, 23f\n" - "st1 { v30.s }[2], [x26]\n" - "b 23f\n" - "22:" // Height 1: Partial direct writeback: partial_1_0 - "str s30, [x26, #0x0]\n" - "23:" // Height 1: Partial direct writeback: Done - "b 25f\n" - "24:" // Height 1: Full writeback - "str q30, [x26, #0x0]\n" - "str q31, [x26, #0x10]\n" - "add x26, x26, #0x20\n" - "25:" // Height 1: Writeback done - "subs x25, x25, #0x8\n" - "bgt 2b\n" - "subs %x[m], %x[m], #0x1\n" - "beq 27f\n" - "ldr x21, 
[%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 26f\n" - "add x21, x21, #0x1\n" - "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "b 1b\n" - "26:" // Update direct input - "mov x20, #0x4\n" - "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" - "b 1b\n" - "27:" // Exit - : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) - : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), - [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), - [offsetof_N] "I"(offsetof(KernelArgs, N)), - [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), - [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), - [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), - [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), - [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), - [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v30", "v31", - "x20", "x21", "x22", "x23", "x24", "x25", "x26"); -} - -#endif // Architectural features check. diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h deleted file mode 100644 index acdd292d..00000000 --- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h +++ /dev/null @@ -1,124 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#if !defined(__aarch64__) -#error This file must be compiled for AArch64. -#else // Architectural features check. 
- -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/// Micro-kernel dependencies -/// -/// -# kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon to pack the RHS matrix - -/// -------------------------------------------------- - -/// Gets m step value. -/// -/// The starting row index must be divisible by `m_step`. -/// -/// @return The m step value. -size_t kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void); - -/// Gets n step value. -/// -/// The starting column index must be divisible by `n_step`. -/// -/// @return The n step value. -size_t kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void); - -/// Gets nr value. -/// -/// This is the packing parameter which must be used to pack the RHS matrix. -/// -/// @return The nr value. -size_t kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void); - -/// Gets kr value. -/// -/// This is the packing parameter which must be used to pack the RHS matrix. -/// -/// @return The kr value. -size_t kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void); - -/// Gets sr value. -/// -/// This is the packing parameter which must be used to pack the RHS matrix. -/// -/// @return The sr value. -size_t kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(void); - -/// Gets the offset in bytes to the data element in the LHS matrix buffer. -/// -/// @param[in] m_idx Row index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t stride); - -/// Gets the offset in bytes to the data element in the packed RHS matrix buffer. -/// -/// @param[in] n_idx Row index. -/// @param[in] k Number of columns. -/// -/// @return The offset in bytes to the data element. 
-size_t kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t n_idx, size_t k); - -/// Gets the offset in bytes to the data element in the destination matrix buffer. -/// -/// @param[in] m_idx Row index. -/// @param[in] n_idx Column index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m_idx, size_t n_idx, size_t stride); - -/// Gets the size in bytes of the destination matrix buffer. -/// -/// @param[in] m Number of rows. -/// @param[in] n Number of columns. -/// -/// @return The size in bytes of the destination matrix buffer. -size_t kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla(size_t m, size_t n); - -/// Runs the matrix multiplication microkernel followed by a clamp operation. -/// -/// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset -/// calculated using the following functions: -/// -/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla. -/// * Packed RHS: @ref kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla. -/// * Output: @ref kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla. -/// -/// @param[in] m Number of output rows to be computed. -/// @param[in] n Number of output columns to be computed. -/// @param[in] k Common dimension of the LHS and RHS operand. -/// @param[in] lhs LHS matrix buffer. -/// @param[in] lhs_stride Row stride in bytes of the LHS matrix. -/// @param[in] rhs_packed Packed RHS buffer. -/// @param[out] dst Output matrix buffer. -/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. -/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) -/// @param[in] clamp_min Minimum value to clamp the final result. 
-/// @param[in] clamp_max Maximum value to clamp the final result. -void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla( - size_t m, size_t n, size_t k, // - const void* lhs, size_t lhs_stride, // - const void* rhs_packed, // - void* dst, size_t dst_stride_row, size_t dst_stride_col, // - float clamp_min, float clamp_max); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // Architectural features check. diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index 3e68bac0..1c7e15ba 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -45,10 +45,8 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h" // matmul_clamp_f32_f32_f32p -#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h" - namespace kai::test { // NOLINTBEGIN(misc-non-private-member-variables-in-classes) @@ -389,55 +387,6 @@ static const std::array matmul_methods = { .fn_matmul_f32_f32p_f32p = nullptr, }, - MatMulMethod{ - .name = "matmul_nt_nt_fp32_fp32_fp32_1x8_neon_mla", - - .m0 = 1, - .n0 = 8, - - .lhs_transposed = false, - .rhs_transposed = false, - - .is_sme2 = false, - - .dst_format = DataFormat(DataType::FP32), - .lhs_format = DataFormat(DataType::FP32), - .packed_lhs_format = DataFormat(DataType::UNKNOWN), - .rhs_format = DataFormat(DataType::FP32), - .packed_rhs_format = DataFormat( - DataType::FP32, 8, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP32, DataType::UNKNOWN, 8, 1), - .bias_format = DataFormat(DataType::FP32), - - .fn_get_mr = nullptr, - .fn_get_nr = kai_get_nr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_get_kr = kai_get_kr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_get_sr = 
kai_get_sr_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - - .fn_get_main_m_step = kai_get_m_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_get_pack_rhs_n_step = kai_get_n_step_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - .fn_get_main_n_step = kai_get_n_step_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - - .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_get_packed_lhs_size = nullptr, - .fn_get_packed_lhs_offset = nullptr, - .fn_pack_lhs = nullptr, - - .fn_get_rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - .fn_get_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - .fn_get_pack_rhs_packed_rhs_offset = kai_get_rhs_packed_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - .fn_get_main_packed_rhs_offset = kai_get_rhs_packed_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_pack_rhs = kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - - .fn_get_bias_offset = kai_get_bias_offset_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon, - - .fn_get_dst_offset = kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_get_dst_size = kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - - .fn_matmul_f16_f16_f16p = nullptr, - .fn_matmul_f32_f32_f32p = kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_1x8x4_neon_mla, - .fn_matmul_f32_f32p_f32p = nullptr, - }, - MatMulMethod{ .name = "matmul_nt_nt_fp16_fp16_fp16_6x16_neon_mla", -- GitLab