From ae9494acca8db0987411344ed556f2a25c49bf6f Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 25 Sep 2024 10:19:42 +0100 Subject: [PATCH 01/12] Add int4 matrix multiplication kernels with dotprod extension Signed-off-by: Anitha Raj --- CMakeLists.txt | 2 + .../CMakeLists.txt | 5 +- .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp | 26 + ...i8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c | 732 ++++++++++++++++ ...i8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h | 137 +++ ...ai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c | 815 ++++++++++++++++++ ...ai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h | 137 +++ 7 files changed, 1853 insertions(+), 1 deletion(-) create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c create mode 100644 kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dc92e3a..558c5011 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,8 @@ set(KLEIDIAI_FILES_NEON set(KLEIDIAI_FILES_NEON_DOTPROD kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c + kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c + kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c 
kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt index 8993c16f..7518d84e 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt @@ -28,7 +28,10 @@ add_executable(matmul_clamp_f32_qai8dxp_qsi4cxp ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c - ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c) + ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c + ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c + ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c + ) # Compile with DotProd and I8MM features enabled target_compile_options(matmul_clamp_f32_qai8dxp_qsi4cxp PRIVATE -march=armv8.2-a+dotprod+i8mm) diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp index fe75aa58..e8db11cd 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp @@ -20,7 +20,9 @@ #include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h" #include 
"kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h" #include "kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h" #include "kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.h" @@ -113,6 +115,30 @@ kai_matmul_ukernel_f32_qa8dxp_qs4cxp ukernel_variants[] = { kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm, kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm, "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + 
kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod"}, }; diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c new file mode 100644 index 00000000..741a6b8c --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c @@ -0,0 +1,732 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#if !defined(__ARM_FEATURE_DOTPROD) +#error "Dotprod extension required to compile this micro-kernel" +#else // Architectural features check. 
+#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h"
+
+#include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "kai/kai_common.h"
+
+static const size_t kai_m_step = 16;  // rows written per pass of the main row loop (16 accumulators v31..v16)
+static const size_t kai_n_step = 4;   // columns written per pass of the column loop (x10 -= 4, dst += 0x10)
+static const size_t kai_mr = 4;       // LHS rows interleaved in one packed block
+static const size_t kai_nr = 4;       // RHS columns interleaved in one packed block (4 sums, 4 scales, 4 biases)
+static const size_t kai_kr = 8;       // kai_kr / kai_sr = 4: each indexed sdot lane consumes 4 K values
+static const size_t kai_sr = 2;
+static const size_t kai_num_bytes_multiplier_lhs = sizeof(float);
+static const size_t kai_num_bytes_multiplier_rhs = sizeof(float);
+static const size_t kai_num_bytes_offset_lhs = sizeof(int32_t);
+static const size_t kai_num_bytes_sum_rhs = sizeof(int32_t);
+static const size_t kai_num_bytes_bias = sizeof(float);
+
+inline static size_t kai_k_roundedup(size_t k) {
+    // The kernel walks K in blocks of 32 values (num_blocks = k_internal / 32), so k is rounded
+    // up to a multiple of 32. NOTE(review): the LHS/RHS packing must round K the same way - confirm.
+    const size_t k_multiple_of = 32;
+    return kai_roundup(k, k_multiple_of);
+}
+
+inline static size_t kai_lhs_packed_stride(size_t k) {  // bytes of one packed block of kai_mr LHS rows
+    const size_t k_internal = kai_k_roundedup(k);
+
+    KAI_ASSERT((k_internal % 2) == 0);
+
+    return kai_mr * (k_internal * sizeof(int8_t) + kai_num_bytes_multiplier_lhs + kai_num_bytes_offset_lhs);
+}
+
+inline static size_t kai_rhs_packed_stride(size_t k) {  // bytes of one packed block of kai_nr RHS columns
+    const size_t k_internal = kai_k_roundedup(k);
+
+    KAI_ASSERT((k_internal % 2) == 0);
+
+    return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias);
+}
+
+size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_m_step;
+}
+
+size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_n_step;
+}
+
+size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_mr;
+}
+
+size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_nr;
+}
+
+size_t
kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_kr;
+}
+
+size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) {
+    return kai_sr;
+}
+
+size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m_idx, size_t k) {
+    KAI_ASSERT((m_idx % kai_m_step) == 0);
+
+    return (m_idx / kai_mr) * kai_lhs_packed_stride(k);  // the stride covers kai_mr rows, not kai_m_step rows
+}
+
+size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t n_idx, size_t k) {
+    KAI_ASSERT((n_idx % kai_n_step) == 0);
+
+    return (n_idx / kai_nr) * kai_rhs_packed_stride(k);  // the stride covers kai_nr columns
+}
+
+size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(
+    size_t m_idx, size_t n_idx, size_t dst_stride) {
+    KAI_ASSERT((m_idx % kai_m_step) == 0);
+    KAI_ASSERT((n_idx % kai_n_step) == 0);
+
+    return (n_idx * sizeof(float)) + m_idx * dst_stride;  // byte offset; dst_stride is the row stride in bytes
+}
+
+size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m, size_t n) {
+    return m * n * sizeof(float);  // bytes of a dense m x n f32 matrix
+}
+
+void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(
+    size_t m, size_t n, size_t k, const void* restrict lhs_packed, const void* restrict rhs_packed, float* restrict dst,
+    size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
+    KAI_ASSERT(dst_stride_col == sizeof(float));  // output columns must be contiguous f32 values
+
+    if (m == 0 || n == 0) {  // the column loop tests its counter after the body, so n == 0 must not enter it
+        return;
+    }
+
+    const size_t k_internal = kai_k_roundedup(k);
+
+    size_t num_blocks = k_internal / 32;  // one sub block loop pass consumes 32 K values
+
+    float clamp_vals[2] = {scalar_min, scalar_max};  // read by the asm with ld1r at offsets 0 and 4
+    __asm__ __volatile__(
+        "mov x13, %x[m]\n"
+        "mov x12, #0x80\n"
+        "mov x20, #0x20\n"
+        "cmp x13, #0x10\n"
+        "madd x12, %x[num_blocks], x12, x20\n"  // x12 = num_blocks * 0x80 + 0x20
+        "blt 14f\n"
+        "1:"  // Row loop
+        "mov x11, %x[rhs_packed]\n"
+        "mov x10, %x[n]\n"
+        "add x9, %x[dst], %x[dst_stride_row], LSL #4\n"
+        "2:"  // Column loop
+        "mov x27, %x[lhs_packed]\n"
+        "movi v31.4s, #0x0\n"
+        "movi v30.4s, #0x0\n"
+        "mov x23, 
%x[num_blocks]\n" + "movi v29.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "add x22, x27, x12\n" + "add x21, x22, x12\n" + "add x20, x21, x12\n" + "movi v25.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "3:" // Sub block loop + "ldr q13, [x11, #0x0]\n" + "ldr q14, [x27, #0x0]\n" + "movi v10.16b, #0xf0\n" + "subs x23, x23, #0x1\n" + "ldr q6, [x22, #0x0]\n" + "ldr q15, [x21, #0x0]\n" + "ldr q3, [x20, #0x0]\n" + "ldr q12, [x11, #0x10]\n" + "ldr q8, [x27, #0x10]\n" + "ldr q7, [x22, #0x10]\n" + "shl v9.16b, v13.16b, #0x4\n" + "and v13.16b, v13.16b, v10.16b\n" + "ldr q2, [x21, #0x10]\n" + "ldr q1, [x20, #0x10]\n" + "ldr q5, [x11, #0x20]\n" + "ldr q4, [x27, #0x20]\n" + "shl v0.16b, v12.16b, #0x4\n" + "and v12.16b, v12.16b, v10.16b\n" + "ldr q11, [x22, #0x20]\n" + ".inst 0x4f8ee13f // sdot v31.4s, v9.16b, v14.4b[0]\n" + ".inst 0x4faee13e // sdot v30.4s, v9.16b, v14.4b[1]\n" + ".inst 0x4f8ee93d // sdot v29.4s, v9.16b, v14.4b[2]\n" + ".inst 0x4faee93c // sdot v28.4s, v9.16b, v14.4b[3]\n" + "ldr q14, [x21, #0x20]\n" + ".inst 0x4f86e13b // sdot v27.4s, v9.16b, v6.4b[0]\n" + ".inst 0x4fa6e13a // sdot v26.4s, v9.16b, v6.4b[1]\n" + ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n" + ".inst 0x4fa6e938 // sdot v24.4s, v9.16b, v6.4b[3]\n" + "ldr q6, [x20, #0x20]\n" + ".inst 0x4f8fe137 // sdot v23.4s, v9.16b, v15.4b[0]\n" + ".inst 0x4fafe136 // sdot v22.4s, v9.16b, v15.4b[1]\n" + ".inst 0x4f8fe935 // sdot v21.4s, v9.16b, v15.4b[2]\n" + ".inst 0x4fafe934 // sdot v20.4s, v9.16b, v15.4b[3]\n" + "ldr q15, [x11, #0x30]\n" + "add x11, x11, #0x40\n" + ".inst 0x4f83e133 // sdot v19.4s, v9.16b, v3.4b[0]\n" + ".inst 0x4fa3e132 // sdot v18.4s, v9.16b, v3.4b[1]\n" + ".inst 0x4f83e931 // sdot v17.4s, v9.16b, v3.4b[2]\n" + ".inst 0x4fa3e930 // sdot v16.4s, v9.16b, 
v3.4b[3]\n" + "ldr q9, [x27, #0x30]\n" + "ldr q3, [x22, #0x30]\n" + ".inst 0x4f88e01f // sdot v31.4s, v0.16b, v8.4b[0]\n" + ".inst 0x4fa8e01e // sdot v30.4s, v0.16b, v8.4b[1]\n" + ".inst 0x4f88e81d // sdot v29.4s, v0.16b, v8.4b[2]\n" + ".inst 0x4fa8e81c // sdot v28.4s, v0.16b, v8.4b[3]\n" + "ldr q8, [x21, #0x30]\n" + ".inst 0x4f87e01b // sdot v27.4s, v0.16b, v7.4b[0]\n" + ".inst 0x4fa7e01a // sdot v26.4s, v0.16b, v7.4b[1]\n" + ".inst 0x4f87e819 // sdot v25.4s, v0.16b, v7.4b[2]\n" + ".inst 0x4fa7e818 // sdot v24.4s, v0.16b, v7.4b[3]\n" + "ldr q7, [x20, #0x30]\n" + ".inst 0x4f82e017 // sdot v23.4s, v0.16b, v2.4b[0]\n" + ".inst 0x4fa2e016 // sdot v22.4s, v0.16b, v2.4b[1]\n" + ".inst 0x4f82e815 // sdot v21.4s, v0.16b, v2.4b[2]\n" + ".inst 0x4fa2e814 // sdot v20.4s, v0.16b, v2.4b[3]\n" + "ldr q2, [x27, #0x40]\n" + ".inst 0x4f81e013 // sdot v19.4s, v0.16b, v1.4b[0]\n" + ".inst 0x4fa1e012 // sdot v18.4s, v0.16b, v1.4b[1]\n" + ".inst 0x4f81e811 // sdot v17.4s, v0.16b, v1.4b[2]\n" + ".inst 0x4fa1e810 // sdot v16.4s, v0.16b, v1.4b[3]\n" + "ldr q0, [x22, #0x40]\n" + "shl v1.16b, v5.16b, #0x4\n" + "and v5.16b, v5.16b, v10.16b\n" + ".inst 0x4f84e03f // sdot v31.4s, v1.16b, v4.4b[0]\n" + ".inst 0x4fa4e03e // sdot v30.4s, v1.16b, v4.4b[1]\n" + ".inst 0x4f84e83d // sdot v29.4s, v1.16b, v4.4b[2]\n" + ".inst 0x4fa4e83c // sdot v28.4s, v1.16b, v4.4b[3]\n" + "ldr q4, [x21, #0x40]\n" + ".inst 0x4f8be03b // sdot v27.4s, v1.16b, v11.4b[0]\n" + ".inst 0x4fabe03a // sdot v26.4s, v1.16b, v11.4b[1]\n" + ".inst 0x4f8be839 // sdot v25.4s, v1.16b, v11.4b[2]\n" + ".inst 0x4fabe838 // sdot v24.4s, v1.16b, v11.4b[3]\n" + "ldr q11, [x20, #0x40]\n" + ".inst 0x4f8ee037 // sdot v23.4s, v1.16b, v14.4b[0]\n" + ".inst 0x4faee036 // sdot v22.4s, v1.16b, v14.4b[1]\n" + ".inst 0x4f8ee835 // sdot v21.4s, v1.16b, v14.4b[2]\n" + ".inst 0x4faee834 // sdot v20.4s, v1.16b, v14.4b[3]\n" + "ldr q14, [x27, #0x50]\n" + ".inst 0x4f86e033 // sdot v19.4s, v1.16b, v6.4b[0]\n" + ".inst 0x4fa6e032 // sdot v18.4s, v1.16b, 
v6.4b[1]\n" + ".inst 0x4f86e831 // sdot v17.4s, v1.16b, v6.4b[2]\n" + ".inst 0x4fa6e830 // sdot v16.4s, v1.16b, v6.4b[3]\n" + "ldr q6, [x22, #0x50]\n" + "shl v1.16b, v15.16b, #0x4\n" + "and v15.16b, v15.16b, v10.16b\n" + "ldr q10, [x21, #0x50]\n" + ".inst 0x4f89e03f // sdot v31.4s, v1.16b, v9.4b[0]\n" + ".inst 0x4fa9e03e // sdot v30.4s, v1.16b, v9.4b[1]\n" + ".inst 0x4f89e83d // sdot v29.4s, v1.16b, v9.4b[2]\n" + ".inst 0x4fa9e83c // sdot v28.4s, v1.16b, v9.4b[3]\n" + "ldr q9, [x20, #0x50]\n" + ".inst 0x4f83e03b // sdot v27.4s, v1.16b, v3.4b[0]\n" + ".inst 0x4fa3e03a // sdot v26.4s, v1.16b, v3.4b[1]\n" + ".inst 0x4f83e839 // sdot v25.4s, v1.16b, v3.4b[2]\n" + ".inst 0x4fa3e838 // sdot v24.4s, v1.16b, v3.4b[3]\n" + "ldr q3, [x27, #0x60]\n" + ".inst 0x4f88e037 // sdot v23.4s, v1.16b, v8.4b[0]\n" + ".inst 0x4fa8e036 // sdot v22.4s, v1.16b, v8.4b[1]\n" + ".inst 0x4f88e835 // sdot v21.4s, v1.16b, v8.4b[2]\n" + ".inst 0x4fa8e834 // sdot v20.4s, v1.16b, v8.4b[3]\n" + "ldr q8, [x22, #0x60]\n" + ".inst 0x4f87e033 // sdot v19.4s, v1.16b, v7.4b[0]\n" + ".inst 0x4fa7e032 // sdot v18.4s, v1.16b, v7.4b[1]\n" + ".inst 0x4f87e831 // sdot v17.4s, v1.16b, v7.4b[2]\n" + ".inst 0x4fa7e830 // sdot v16.4s, v1.16b, v7.4b[3]\n" + "ldr q7, [x21, #0x60]\n" + "ldr q1, [x20, #0x60]\n" + ".inst 0x4f82e1bf // sdot v31.4s, v13.16b, v2.4b[0]\n" + ".inst 0x4fa2e1be // sdot v30.4s, v13.16b, v2.4b[1]\n" + ".inst 0x4f82e9bd // sdot v29.4s, v13.16b, v2.4b[2]\n" + ".inst 0x4fa2e9bc // sdot v28.4s, v13.16b, v2.4b[3]\n" + "ldr q2, [x27, #0x70]\n" + "add x27, x27, #0x80\n" + ".inst 0x4f80e1bb // sdot v27.4s, v13.16b, v0.4b[0]\n" + ".inst 0x4fa0e1ba // sdot v26.4s, v13.16b, v0.4b[1]\n" + ".inst 0x4f80e9b9 // sdot v25.4s, v13.16b, v0.4b[2]\n" + ".inst 0x4fa0e9b8 // sdot v24.4s, v13.16b, v0.4b[3]\n" + "ldr q0, [x22, #0x70]\n" + "add x22, x22, #0x80\n" + ".inst 0x4f84e1b7 // sdot v23.4s, v13.16b, v4.4b[0]\n" + ".inst 0x4fa4e1b6 // sdot v22.4s, v13.16b, v4.4b[1]\n" + ".inst 0x4f84e9b5 // sdot v21.4s, v13.16b, 
v4.4b[2]\n" + ".inst 0x4fa4e9b4 // sdot v20.4s, v13.16b, v4.4b[3]\n" + "ldr q4, [x21, #0x70]\n" + "add x21, x21, #0x80\n" + ".inst 0x4f8be1b3 // sdot v19.4s, v13.16b, v11.4b[0]\n" + ".inst 0x4fabe1b2 // sdot v18.4s, v13.16b, v11.4b[1]\n" + ".inst 0x4f8be9b1 // sdot v17.4s, v13.16b, v11.4b[2]\n" + ".inst 0x4fabe9b0 // sdot v16.4s, v13.16b, v11.4b[3]\n" + "ldr q11, [x20, #0x70]\n" + "add x20, x20, #0x80\n" + ".inst 0x4f8ee19f // sdot v31.4s, v12.16b, v14.4b[0]\n" + ".inst 0x4faee19e // sdot v30.4s, v12.16b, v14.4b[1]\n" + ".inst 0x4f8ee99d // sdot v29.4s, v12.16b, v14.4b[2]\n" + ".inst 0x4faee99c // sdot v28.4s, v12.16b, v14.4b[3]\n" + ".inst 0x4f86e19b // sdot v27.4s, v12.16b, v6.4b[0]\n" + ".inst 0x4fa6e19a // sdot v26.4s, v12.16b, v6.4b[1]\n" + ".inst 0x4f86e999 // sdot v25.4s, v12.16b, v6.4b[2]\n" + ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n" + ".inst 0x4f8ae197 // sdot v23.4s, v12.16b, v10.4b[0]\n" + ".inst 0x4faae196 // sdot v22.4s, v12.16b, v10.4b[1]\n" + ".inst 0x4f8ae995 // sdot v21.4s, v12.16b, v10.4b[2]\n" + ".inst 0x4faae994 // sdot v20.4s, v12.16b, v10.4b[3]\n" + ".inst 0x4f89e193 // sdot v19.4s, v12.16b, v9.4b[0]\n" + ".inst 0x4fa9e192 // sdot v18.4s, v12.16b, v9.4b[1]\n" + ".inst 0x4f89e991 // sdot v17.4s, v12.16b, v9.4b[2]\n" + ".inst 0x4fa9e990 // sdot v16.4s, v12.16b, v9.4b[3]\n" + ".inst 0x4f83e0bf // sdot v31.4s, v5.16b, v3.4b[0]\n" + ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n" + ".inst 0x4f83e8bd // sdot v29.4s, v5.16b, v3.4b[2]\n" + ".inst 0x4fa3e8bc // sdot v28.4s, v5.16b, v3.4b[3]\n" + ".inst 0x4f88e0bb // sdot v27.4s, v5.16b, v8.4b[0]\n" + ".inst 0x4fa8e0ba // sdot v26.4s, v5.16b, v8.4b[1]\n" + ".inst 0x4f88e8b9 // sdot v25.4s, v5.16b, v8.4b[2]\n" + ".inst 0x4fa8e8b8 // sdot v24.4s, v5.16b, v8.4b[3]\n" + ".inst 0x4f87e0b7 // sdot v23.4s, v5.16b, v7.4b[0]\n" + ".inst 0x4fa7e0b6 // sdot v22.4s, v5.16b, v7.4b[1]\n" + ".inst 0x4f87e8b5 // sdot v21.4s, v5.16b, v7.4b[2]\n" + ".inst 0x4fa7e8b4 // sdot v20.4s, v5.16b, v7.4b[3]\n" 
+ ".inst 0x4f81e0b3 // sdot v19.4s, v5.16b, v1.4b[0]\n" + ".inst 0x4fa1e0b2 // sdot v18.4s, v5.16b, v1.4b[1]\n" + ".inst 0x4f81e8b1 // sdot v17.4s, v5.16b, v1.4b[2]\n" + ".inst 0x4fa1e8b0 // sdot v16.4s, v5.16b, v1.4b[3]\n" + ".inst 0x4f82e1ff // sdot v31.4s, v15.16b, v2.4b[0]\n" + ".inst 0x4fa2e1fe // sdot v30.4s, v15.16b, v2.4b[1]\n" + ".inst 0x4f82e9fd // sdot v29.4s, v15.16b, v2.4b[2]\n" + ".inst 0x4fa2e9fc // sdot v28.4s, v15.16b, v2.4b[3]\n" + ".inst 0x4f80e1fb // sdot v27.4s, v15.16b, v0.4b[0]\n" + ".inst 0x4fa0e1fa // sdot v26.4s, v15.16b, v0.4b[1]\n" + ".inst 0x4f80e9f9 // sdot v25.4s, v15.16b, v0.4b[2]\n" + ".inst 0x4fa0e9f8 // sdot v24.4s, v15.16b, v0.4b[3]\n" + ".inst 0x4f84e1f7 // sdot v23.4s, v15.16b, v4.4b[0]\n" + ".inst 0x4fa4e1f6 // sdot v22.4s, v15.16b, v4.4b[1]\n" + ".inst 0x4f84e9f5 // sdot v21.4s, v15.16b, v4.4b[2]\n" + ".inst 0x4fa4e9f4 // sdot v20.4s, v15.16b, v4.4b[3]\n" + ".inst 0x4f8be1f3 // sdot v19.4s, v15.16b, v11.4b[0]\n" + ".inst 0x4fabe1f2 // sdot v18.4s, v15.16b, v11.4b[1]\n" + ".inst 0x4f8be9f1 // sdot v17.4s, v15.16b, v11.4b[2]\n" + ".inst 0x4fabe9f0 // sdot v16.4s, v15.16b, v11.4b[3]\n" + "bgt 3b\n" + "ldr q5, [x11, #0x0]\n" + "ld1 { v1.4s }, [x27]\n" + "add x27, x27, #0x10\n" + "ldr q4, [x11, #0x10]\n" + "ldr q0, [x27, #0x0]\n" + "add x11, x11, #0x20\n" + "mla v31.4s, v5.4s, v1.s[0]\n" + "mla v30.4s, v5.4s, v1.s[1]\n" + "mla v29.4s, v5.4s, v1.s[2]\n" + "mla v28.4s, v5.4s, v1.s[3]\n" + "fmul v3.4s, v4.4s, v0.s[0]\n" + "fmul v2.4s, v4.4s, v0.s[1]\n" + "fmul v1.4s, v4.4s, v0.s[2]\n" + "scvtf v31.4s, v31.4s\n" + "fmul v0.4s, v4.4s, v0.s[3]\n" + "scvtf v30.4s, v30.4s\n" + "scvtf v29.4s, v29.4s\n" + "scvtf v28.4s, v28.4s\n" + "fmul v31.4s, v31.4s, v3.4s\n" + "fmul v30.4s, v30.4s, v2.4s\n" + "fmul v29.4s, v29.4s, v1.4s\n" + "fmul v28.4s, v28.4s, v0.4s\n" + "ld1 { v1.4s }, [x22]\n" + "add x22, x22, #0x10\n" + "ldr q0, [x22, #0x0]\n" + "mla v27.4s, v5.4s, v1.s[0]\n" + "mla v26.4s, v5.4s, v1.s[1]\n" + "mla v25.4s, v5.4s, v1.s[2]\n" + "mla 
v24.4s, v5.4s, v1.s[3]\n" + "fmul v3.4s, v4.4s, v0.s[0]\n" + "fmul v2.4s, v4.4s, v0.s[1]\n" + "fmul v1.4s, v4.4s, v0.s[2]\n" + "scvtf v27.4s, v27.4s\n" + "fmul v0.4s, v4.4s, v0.s[3]\n" + "scvtf v26.4s, v26.4s\n" + "scvtf v25.4s, v25.4s\n" + "scvtf v24.4s, v24.4s\n" + "fmul v27.4s, v27.4s, v3.4s\n" + "fmul v26.4s, v26.4s, v2.4s\n" + "fmul v25.4s, v25.4s, v1.4s\n" + "fmul v24.4s, v24.4s, v0.4s\n" + "ld1 { v1.4s }, [x21]\n" + "add x21, x21, #0x10\n" + "ldr q0, [x21, #0x0]\n" + "mla v23.4s, v5.4s, v1.s[0]\n" + "mla v22.4s, v5.4s, v1.s[1]\n" + "mla v21.4s, v5.4s, v1.s[2]\n" + "mla v20.4s, v5.4s, v1.s[3]\n" + "fmul v3.4s, v4.4s, v0.s[0]\n" + "fmul v2.4s, v4.4s, v0.s[1]\n" + "fmul v1.4s, v4.4s, v0.s[2]\n" + "scvtf v23.4s, v23.4s\n" + "fmul v0.4s, v4.4s, v0.s[3]\n" + "scvtf v22.4s, v22.4s\n" + "scvtf v21.4s, v21.4s\n" + "scvtf v20.4s, v20.4s\n" + "fmul v23.4s, v23.4s, v3.4s\n" + "fmul v22.4s, v22.4s, v2.4s\n" + "fmul v21.4s, v21.4s, v1.4s\n" + "fmul v20.4s, v20.4s, v0.4s\n" + "ld1 { v1.4s }, [x20]\n" + "add x20, x20, #0x10\n" + "ldr q0, [x20, #0x0]\n" + "mla v19.4s, v5.4s, v1.s[0]\n" + "mla v18.4s, v5.4s, v1.s[1]\n" + "mla v17.4s, v5.4s, v1.s[2]\n" + "mla v16.4s, v5.4s, v1.s[3]\n" + "fmul v3.4s, v4.4s, v0.s[0]\n" + "fmul v2.4s, v4.4s, v0.s[1]\n" + "fmul v1.4s, v4.4s, v0.s[2]\n" + "scvtf v19.4s, v19.4s\n" + "fmul v0.4s, v4.4s, v0.s[3]\n" + "scvtf v18.4s, v18.4s\n" + "scvtf v17.4s, v17.4s\n" + "scvtf v16.4s, v16.4s\n" + "fmul v19.4s, v19.4s, v3.4s\n" + "fmul v18.4s, v18.4s, v2.4s\n" + "fmul v17.4s, v17.4s, v1.4s\n" + "fmul v16.4s, v16.4s, v0.4s\n" + "ld1r { v1.4s }, [%x[clamp_vals]]\n" + "add x20, %x[clamp_vals], #0x4\n" + "cmp x10, #0x4\n" + "ld1r { v0.4s }, [x20]\n" + "add x11, x11, #0x10\n" + "fmax v31.4s, v31.4s, v1.4s\n" + "fmax v30.4s, v30.4s, v1.4s\n" + "fmax v29.4s, v29.4s, v1.4s\n" + "fmax v28.4s, v28.4s, v1.4s\n" + "fmax v27.4s, v27.4s, v1.4s\n" + "fmax v26.4s, v26.4s, v1.4s\n" + "fmax v25.4s, v25.4s, v1.4s\n" + "fmax v24.4s, v24.4s, v1.4s\n" + "fmax v23.4s, 
v23.4s, v1.4s\n" + "fmax v22.4s, v22.4s, v1.4s\n" + "fmax v21.4s, v21.4s, v1.4s\n" + "fmax v20.4s, v20.4s, v1.4s\n" + "fmax v19.4s, v19.4s, v1.4s\n" + "fmax v18.4s, v18.4s, v1.4s\n" + "fmax v17.4s, v17.4s, v1.4s\n" + "fmax v16.4s, v16.4s, v1.4s\n" + "fmin v31.4s, v31.4s, v0.4s\n" + "fmin v30.4s, v30.4s, v0.4s\n" + "fmin v29.4s, v29.4s, v0.4s\n" + "fmin v28.4s, v28.4s, v0.4s\n" + "fmin v27.4s, v27.4s, v0.4s\n" + "fmin v26.4s, v26.4s, v0.4s\n" + "fmin v25.4s, v25.4s, v0.4s\n" + "fmin v24.4s, v24.4s, v0.4s\n" + "fmin v23.4s, v23.4s, v0.4s\n" + "fmin v22.4s, v22.4s, v0.4s\n" + "fmin v21.4s, v21.4s, v0.4s\n" + "fmin v20.4s, v20.4s, v0.4s\n" + "fmin v19.4s, v19.4s, v0.4s\n" + "fmin v18.4s, v18.4s, v0.4s\n" + "fmin v17.4s, v17.4s, v0.4s\n" + "fmin v16.4s, v16.4s, v0.4s\n" + "blt 8f\n" + "mov x20, %x[dst]\n" + "str q31, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q29, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q28, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q27, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q26, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q25, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q24, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q23, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q22, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q21, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q20, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q19, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q18, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q17, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q16, [x20, #0x0]\n" + "b 13f\n" + "8:" // Partial output + "mov x28, %x[dst]\n" + "add x26, x28, %x[dst_stride_row], LSL #2\n" + "add x25, x26, %x[dst_stride_row], LSL #1\n" + "add x24, x26, 
%x[dst_stride_row]\n" + "add x23, x25, %x[dst_stride_row]\n" + "add x22, x28, %x[dst_stride_row], LSL #1\n" + "add x21, x28, %x[dst_stride_row]\n" + "add x20, x22, %x[dst_stride_row]\n" + "add x27, x23, %x[dst_stride_row]\n" + "tbz x10, #1, 9f\n" + "st1 { v24.d }[0], [x23], #0x8\n" + "st1 { v25.d }[0], [x25], #0x8\n" + "st1 { v26.d }[0], [x24], #0x8\n" + "st1 { v27.d }[0], [x26], #0x8\n" + "st1 { v28.d }[0], [x20], #0x8\n" + "st1 { v29.d }[0], [x22], #0x8\n" + "st1 { v30.d }[0], [x21], #0x8\n" + "st1 { v31.d }[0], [x28], #0x8\n" + "tbz x10, #0, 10f\n" + "st1 { v24.s }[2], [x23]\n" + "st1 { v25.s }[2], [x25]\n" + "st1 { v26.s }[2], [x24]\n" + "st1 { v27.s }[2], [x26]\n" + "st1 { v28.s }[2], [x20]\n" + "st1 { v29.s }[2], [x22]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v31.s }[2], [x28]\n" + "b 10f\n" + "9:" // Output block 0: partial_1_0 + "st1 { v24.s }[0], [x23]\n" + "st1 { v25.s }[0], [x25]\n" + "st1 { v26.s }[0], [x24]\n" + "st1 { v27.s }[0], [x26]\n" + "st1 { v28.s }[0], [x20]\n" + "st1 { v29.s }[0], [x22]\n" + "st1 { v30.s }[0], [x21]\n" + "st1 { v31.s }[0], [x28]\n" + "10:" // Output block 0: Done + "add x26, x27, %x[dst_stride_row], LSL #2\n" + "add x25, x27, %x[dst_stride_row], LSL #1\n" + "add x24, x26, %x[dst_stride_row], LSL #1\n" + "add x23, x27, %x[dst_stride_row]\n" + "add x22, x25, %x[dst_stride_row]\n" + "add x21, x26, %x[dst_stride_row]\n" + "add x20, x24, %x[dst_stride_row]\n" + "tbz x10, #1, 11f\n" + "st1 { v16.d }[0], [x20], #0x8\n" + "st1 { v17.d }[0], [x24], #0x8\n" + "st1 { v18.d }[0], [x21], #0x8\n" + "st1 { v19.d }[0], [x26], #0x8\n" + "st1 { v20.d }[0], [x22], #0x8\n" + "st1 { v21.d }[0], [x25], #0x8\n" + "st1 { v22.d }[0], [x23], #0x8\n" + "st1 { v23.d }[0], [x27], #0x8\n" + "tbz x10, #0, 12f\n" + "st1 { v16.s }[2], [x20]\n" + "st1 { v17.s }[2], [x24]\n" + "st1 { v18.s }[2], [x21]\n" + "st1 { v19.s }[2], [x26]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v22.s }[2], [x23]\n" + "st1 { v23.s }[2], [x27]\n" + "b 
12f\n" + "11:" // Output block 1: partial_1_0 + "st1 { v16.s }[0], [x20]\n" + "st1 { v17.s }[0], [x24]\n" + "st1 { v18.s }[0], [x21]\n" + "st1 { v19.s }[0], [x26]\n" + "st1 { v20.s }[0], [x22]\n" + "st1 { v21.s }[0], [x25]\n" + "st1 { v22.s }[0], [x23]\n" + "st1 { v23.s }[0], [x27]\n" + "12:" // Output block 1: Done + "13:" // Output stage exit + "subs x10, x10, #0x4\n" + "add %x[dst], %x[dst], #0x10\n" + "bgt 2b\n" + "mov x20, #0x4\n" + "sub x13, x13, #0x10\n" + "cmp x13, #0x10\n" + "mov %x[dst], x9\n" + "madd %x[lhs_packed], x20, x12, %x[lhs_packed]\n" + "bge 1b\n" + "14:" // Row loop skip + "cbz x13, 23f\n" + "15:" // Row tail: Row loop + "mov x26, %x[rhs_packed]\n" + "mov x25, %x[n]\n" + "add x24, %x[dst], %x[dst_stride_row], LSL #2\n" + "16:" // Row tail: Column loop + "mov x27, %x[lhs_packed]\n" + "movi v31.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "mov x20, %x[num_blocks]\n" + "movi v29.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "17:" // Row tail: Sub block loop + "ldr q4, [x26, #0x0]\n" + "ldr q3, [x27, #0x0]\n" + "movi v2.16b, #0xf0\n" + "subs x20, x20, #0x1\n" + "ldr q1, [x26, #0x10]\n" + "ldr q0, [x27, #0x10]\n" + "ldr q27, [x26, #0x20]\n" + "ldr q26, [x27, #0x20]\n" + "ldr q25, [x26, #0x30]\n" + "ldr q24, [x27, #0x30]\n" + "shl v23.16b, v4.16b, #0x4\n" + "and v4.16b, v4.16b, v2.16b\n" + "ldr q22, [x27, #0x40]\n" + "ldr q21, [x27, #0x50]\n" + "shl v20.16b, v1.16b, #0x4\n" + "and v1.16b, v1.16b, v2.16b\n" + "ldr q19, [x27, #0x60]\n" + "ldr q18, [x27, #0x70]\n" + "shl v17.16b, v27.16b, #0x4\n" + "and v27.16b, v27.16b, v2.16b\n" + ".inst 0x4f83e2ff // sdot v31.4s, v23.16b, v3.4b[0]\n" + ".inst 0x4fa3e2fe // sdot v30.4s, v23.16b, v3.4b[1]\n" + "shl v16.16b, v25.16b, #0x4\n" + "add x26, x26, #0x40\n" + ".inst 0x4f83eafd // sdot v29.4s, v23.16b, v3.4b[2]\n" + ".inst 0x4fa3eafc // sdot v28.4s, v23.16b, v3.4b[3]\n" + "and v25.16b, v25.16b, v2.16b\n" + "add x27, x27, #0x80\n" + ".inst 0x4f80e29f // sdot v31.4s, v20.16b, v0.4b[0]\n" + ".inst 0x4fa0e29e // sdot v30.4s, 
v20.16b, v0.4b[1]\n" + ".inst 0x4f80ea9d // sdot v29.4s, v20.16b, v0.4b[2]\n" + ".inst 0x4fa0ea9c // sdot v28.4s, v20.16b, v0.4b[3]\n" + ".inst 0x4f9ae23f // sdot v31.4s, v17.16b, v26.4b[0]\n" + ".inst 0x4fbae23e // sdot v30.4s, v17.16b, v26.4b[1]\n" + ".inst 0x4f9aea3d // sdot v29.4s, v17.16b, v26.4b[2]\n" + ".inst 0x4fbaea3c // sdot v28.4s, v17.16b, v26.4b[3]\n" + ".inst 0x4f98e21f // sdot v31.4s, v16.16b, v24.4b[0]\n" + ".inst 0x4fb8e21e // sdot v30.4s, v16.16b, v24.4b[1]\n" + ".inst 0x4f98ea1d // sdot v29.4s, v16.16b, v24.4b[2]\n" + ".inst 0x4fb8ea1c // sdot v28.4s, v16.16b, v24.4b[3]\n" + ".inst 0x4f96e09f // sdot v31.4s, v4.16b, v22.4b[0]\n" + ".inst 0x4fb6e09e // sdot v30.4s, v4.16b, v22.4b[1]\n" + ".inst 0x4f96e89d // sdot v29.4s, v4.16b, v22.4b[2]\n" + ".inst 0x4fb6e89c // sdot v28.4s, v4.16b, v22.4b[3]\n" + ".inst 0x4f95e03f // sdot v31.4s, v1.16b, v21.4b[0]\n" + ".inst 0x4fb5e03e // sdot v30.4s, v1.16b, v21.4b[1]\n" + ".inst 0x4f95e83d // sdot v29.4s, v1.16b, v21.4b[2]\n" + ".inst 0x4fb5e83c // sdot v28.4s, v1.16b, v21.4b[3]\n" + ".inst 0x4f93e37f // sdot v31.4s, v27.16b, v19.4b[0]\n" + ".inst 0x4fb3e37e // sdot v30.4s, v27.16b, v19.4b[1]\n" + ".inst 0x4f93eb7d // sdot v29.4s, v27.16b, v19.4b[2]\n" + ".inst 0x4fb3eb7c // sdot v28.4s, v27.16b, v19.4b[3]\n" + ".inst 0x4f92e33f // sdot v31.4s, v25.16b, v18.4b[0]\n" + ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n" + ".inst 0x4f92eb3d // sdot v29.4s, v25.16b, v18.4b[2]\n" + ".inst 0x4fb2eb3c // sdot v28.4s, v25.16b, v18.4b[3]\n" + "bgt 17b\n" + "ldr q18, [x26, #0x0]\n" + "ld1 { v17.4s }, [x27]\n" + "add x27, x27, #0x10\n" + "ldr q20, [x26, #0x10]\n" + "ldr q16, [x27, #0x0]\n" + "add x26, x26, #0x20\n" + "mla v31.4s, v18.4s, v17.s[0]\n" + "mla v30.4s, v18.4s, v17.s[1]\n" + "mla v29.4s, v18.4s, v17.s[2]\n" + "mla v28.4s, v18.4s, v17.s[3]\n" + "fmul v19.4s, v20.4s, v16.s[0]\n" + "fmul v18.4s, v20.4s, v16.s[1]\n" + "fmul v17.4s, v20.4s, v16.s[2]\n" + "scvtf v31.4s, v31.4s\n" + "fmul v16.4s, v20.4s, 
v16.s[3]\n" + "scvtf v30.4s, v30.4s\n" + "scvtf v29.4s, v29.4s\n" + "scvtf v28.4s, v28.4s\n" + "fmul v31.4s, v31.4s, v19.4s\n" + "fmul v30.4s, v30.4s, v18.4s\n" + "fmul v29.4s, v29.4s, v17.4s\n" + "fmul v28.4s, v28.4s, v16.4s\n" + "ld1r { v17.4s }, [%x[clamp_vals]]\n" + "add x20, %x[clamp_vals], #0x4\n" + "cmp x25, #0x4\n" + "ld1r { v16.4s }, [x20]\n" + "add x26, x26, #0x10\n" + "fmax v31.4s, v31.4s, v17.4s\n" + "fmax v30.4s, v30.4s, v17.4s\n" + "fmax v29.4s, v29.4s, v17.4s\n" + "fmax v28.4s, v28.4s, v17.4s\n" + "fmin v31.4s, v31.4s, v16.4s\n" + "fmin v30.4s, v30.4s, v16.4s\n" + "fmin v29.4s, v29.4s, v16.4s\n" + "fmin v28.4s, v28.4s, v16.4s\n" + "blt 19f\n" + "mov x20, %x[dst]\n" + "cmp x13, #0x1\n" + "str q31, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "cmp x13, #0x2\n" + "str q30, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "cmp x13, #0x3\n" + "str q29, [x20, #0x0]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "str q28, [x20, #0x0]\n" + "b 22f\n" + "19:" // Row tail: Partial output + "mov x23, %x[dst]\n" + "cmp x13, #0x1\n" + "add x22, x23, %x[dst_stride_row]\n" + "csel x22, x22, x23, GT\n" + "cmp x13, #0x2\n" + "add x21, x23, %x[dst_stride_row], LSL #1\n" + "csel x21, x21, x22, GT\n" + "cmp x13, #0x3\n" + "add x20, x21, %x[dst_stride_row]\n" + "csel x20, x20, x21, GT\n" + "tbz x25, #1, 20f\n" + "st1 { v28.d }[0], [x20], #0x8\n" + "st1 { v29.d }[0], [x21], #0x8\n" + "st1 { v30.d }[0], [x22], #0x8\n" + "st1 { v31.d }[0], [x23], #0x8\n" + "tbz x25, #0, 21f\n" + "st1 { v28.s }[2], [x20]\n" + "st1 { v29.s }[2], [x21]\n" + "st1 { v30.s }[2], [x22]\n" + "st1 { v31.s }[2], [x23]\n" + "b 21f\n" + "20:" // Row tail: Output block 0: partial_1_0 + "st1 { v28.s }[0], [x20]\n" + "st1 { v29.s }[0], [x21]\n" + "st1 { v30.s }[0], [x22]\n" + "st1 { v31.s }[0], [x23]\n" + "21:" // Row tail: Output block 0: Done + "22:" // Row tail: Output stage exit + "subs x25, x25, #0x4\n" + "add %x[dst], %x[dst], #0x10\n" + "bgt 
16b\n" + "subs x13, x13, #0x4\n" + "add %x[lhs_packed], %x[lhs_packed], x12\n" + "mov %x[dst], x24\n" + "bgt 15b\n" + "23:" // Row tail: Row loop skip + : [dst] "+&r"(dst), [lhs_packed] "+&r"(lhs_packed) + : [clamp_vals] "r"(clamp_vals), [dst_stride_row] "r"(dst_stride_row), [m] "r"(m), [n] "r"(n), + [num_blocks] "r"(num_blocks), [rhs_packed] "r"(rhs_packed) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", + "x28"); +} + +#endif // Architectural features check. diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h new file mode 100644 index 00000000..8b009bb0 --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h @@ -0,0 +1,137 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. 
+///
+/// @return the m step value
+size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
+size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the mr value, which must be used to pack the LHS matrix
+///
+/// @return the mr value
+size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
+size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
+size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
+size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void);
+
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed Signed 8-bit quantized asymmetric per-row (qai8dxp) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 16.
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
+size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m_idx, size_t k);
+
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed Signed 4-bit quantized symmetric per-channel (qsi4cxp) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
+size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(
+    size_t n_idx,  //
+    size_t k);
+
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 16.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 8.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
+size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(
+    size_t m_idx,  //
+    size_t n_idx,  //
+    size_t dst_stride);
+
+/// Gets the size in bytes for the destination (DST) matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix.
+/// @param[in] n Number of columns in the destination (DST) matrix.
+///
+/// @return the destination (DST) matrix size in bytes
+size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m, size_t n);
+
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dxp) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cxp) and packed.
+/// Output tile: (rows x cols) = 16 x 4
+/// Accumulation performed in a single for loop: 32
+/// Extension used: dotprod
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension between the LHS and RHS matrix.
+/// @param[in]  lhs_packed     The LHS packed matrix.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS packed matrix, which is obtained by calling @ref
+///                            kai_run_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0
+///                            OR kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            The DST matrix.
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. It must be sizeof(float).
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
+void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( + size_t m, size_t n, size_t k, + + const void* lhs_packed, // + const void* rhs_packed, // + float* dst, // + size_t dst_stride_row, // + size_t dst_stride_col, // + float scalar_min, // + float scalar_max); // + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c new file mode 100644 index 00000000..69e58159 --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c @@ -0,0 +1,815 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#if !defined(__ARM_FEATURE_DOTPROD) +#error "Dotprod extension required to compile this micro-kernel" +#else // Architectural features check. 
+#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h"
+
+#include
+#include
+#include
+
+#include "kai/kai_common.h"
+
+static const size_t kai_m_step = 8;  // rows consumed per iteration of the assembly row loop
+static const size_t kai_n_step = 8;  // columns consumed per iteration of the assembly column loop
+static const size_t kai_mr = 4;      // the assembly reads the packed LHS in blocks of 4 rows (0x80 bytes per 32 K)
+static const size_t kai_nr = 8;
+static const size_t kai_kr = 16;
+static const size_t kai_sr = 2;
+static const size_t kai_num_bytes_multiplier_lhs = sizeof(float);
+static const size_t kai_num_bytes_multiplier_rhs = sizeof(float);
+static const size_t kai_num_bytes_offset_lhs = sizeof(int32_t);
+static const size_t kai_num_bytes_sum_rhs = sizeof(int32_t);
+static const size_t kai_num_bytes_bias = sizeof(float);
+
+inline static size_t kai_k_roundedup(size_t k) {  // K rounded up to a whole number of kr * sr chunks
+    // Since we pack a float and int32 value at the end of the row,
+    // we must make sure that k is a multiple of 4 for alignment
+    size_t kr_sr_roundedup4 = kai_roundup(kai_kr * kai_sr, 4);
+    return kai_roundup(k, kr_sr_roundedup4);
+}
+
+inline static size_t kai_lhs_packed_stride(size_t k) {  // bytes of one packed LHS block, i.e. kai_mr rows
+    const size_t k_internal = kai_k_roundedup(k);
+
+    KAI_ASSERT((k_internal % 2) == 0);
+
+    return kai_mr * (k_internal * sizeof(int8_t) + kai_num_bytes_multiplier_lhs + kai_num_bytes_offset_lhs);
+}
+
+inline static size_t kai_rhs_packed_stride(size_t k) {  // bytes of one packed RHS block, i.e. kai_nr entries along N
+    const size_t k_internal = kai_k_roundedup(k);
+
+    KAI_ASSERT((k_internal % 2) == 0);
+
+    return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias);
+}
+
+size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_m_step;
+}
+
+size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_n_step;
+}
+
+size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_mr;
+}
+
+size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_nr;
+}
+
+size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_kr;
+}
+
+size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) {
+    return kai_sr;
+}
+
+size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m_idx, size_t k) {
+    KAI_ASSERT((m_idx % kai_m_step) == 0);
+
+    return (m_idx / kai_mr) * kai_lhs_packed_stride(k);  // the stride spans kai_mr rows, not kai_m_step rows
+}
+
+size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t n_idx, size_t k) {
+    KAI_ASSERT((n_idx % kai_n_step) == 0);
+
+    return (n_idx / kai_nr) * kai_rhs_packed_stride(k);  // the stride spans kai_nr entries along N
+}
+
+size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(
+    size_t m_idx, size_t n_idx, size_t dst_stride) {
+    KAI_ASSERT((m_idx % kai_m_step) == 0);
+    KAI_ASSERT((n_idx % kai_n_step) == 0);
+
+    return (n_idx * sizeof(float)) + m_idx * dst_stride;
+}
+
+size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m, size_t n) {
+    return m * n * sizeof(float);
+}
+
+void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(
+    size_t m, size_t n, size_t k, const void* restrict lhs_packed, const void* restrict rhs_packed, float* restrict dst,
+    size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) {
+    KAI_ASSERT(dst_stride_col == sizeof(float));
+
+    if (m == 0) {
+        return;
+    }
+
+    const size_t k_internal = kai_k_roundedup(k);
+
+    size_t num_blocks = k_internal / 32;
+
+    float clamp_vals[2] = {scalar_min, scalar_max};
+    __asm__ __volatile__(
+        "mov x12, %x[m]\n"
+        "mov x11, #0x80\n"
+        "movi v16.16b, #0xf0\n"
+        "mov x20, #0x20\n"
+        "cmp x12, #0x8\n"
+        "madd x11, %x[num_blocks], x11, x20\n"
+        "blt 12f\n"
+        "1:"  // Row loop
+        "mov x10, %x[rhs_packed]\n"
+        "mov x9, %x[n]\n"
+        "add x28, %x[dst], %x[dst_stride_row], LSL #3\n"
+        "2:"  // Column loop
+        "mov x22, %x[lhs_packed]\n"
+        "movi v13.4s, #0x0\n"
+        "movi v22.4s, #0x0\n"
+        "mov
x21, %x[num_blocks]\n" + "movi v12.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "add x20, x22, x11\n" + "movi v6.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v3.4s, #0x0\n" + "movi v4.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v2.4s, #0x0\n" + "3:" // Sub block loop + "ldr q8, [x10, #0x0]\n" + "ldr q19, [x10, #0x10]\n" + "subs x21, x21, #0x1\n" + "ldr q26, [x22, #0x0]\n" + "ldr q1, [x20, #0x0]\n" + "ldr q5, [x10, #0x20]\n" + "ldr q17, [x10, #0x30]\n" + "ldr q29, [x22, #0x10]\n" + "ldr q28, [x20, #0x10]\n" + "shl v24.16b, v8.16b, #0x4\n" + "shl v27.16b, v19.16b, #0x4\n" + "ldr q18, [x10, #0x40]\n" + "ldr q20, [x10, #0x50]\n" + "and v8.16b, v8.16b, v16.16b\n" + "and v19.16b, v19.16b, v16.16b\n" + "ldr q21, [x22, #0x20]\n" + "ldr q7, [x20, #0x20]\n" + "shl v0.16b, v5.16b, #0x4\n" + "and v5.16b, v5.16b, v16.16b\n" + ".inst 0x4f9ae30d // sdot v13.4s, v24.16b, v26.4b[0]\n" + ".inst 0x4f9ae376 // sdot v22.4s, v27.16b, v26.4b[0]\n" + ".inst 0x4fbae30c // sdot v12.4s, v24.16b, v26.4b[1]\n" + ".inst 0x4fbae36f // sdot v15.4s, v27.16b, v26.4b[1]\n" + ".inst 0x4f9aeb0b // sdot v11.4s, v24.16b, v26.4b[2]\n" + ".inst 0x4f9aeb6e // sdot v14.4s, v27.16b, v26.4b[2]\n" + ".inst 0x4fbaeb06 // sdot v6.4s, v24.16b, v26.4b[3]\n" + ".inst 0x4fbaeb7f // sdot v31.4s, v27.16b, v26.4b[3]\n" + "ldr q26, [x10, #0x60]\n" + ".inst 0x4f81e319 // sdot v25.4s, v24.16b, v1.4b[0]\n" + ".inst 0x4f81e363 // sdot v3.4s, v27.16b, v1.4b[0]\n" + ".inst 0x4fa1e304 // sdot v4.4s, v24.16b, v1.4b[1]\n" + ".inst 0x4fa1e377 // sdot v23.4s, v27.16b, v1.4b[1]\n" + ".inst 0x4f81eb09 // sdot v9.4s, v24.16b, v1.4b[2]\n" + ".inst 0x4f81eb6a // sdot v10.4s, v27.16b, v1.4b[2]\n" + ".inst 0x4fa1eb1e // sdot v30.4s, v24.16b, v1.4b[3]\n" + "ldr q24, [x10, #0x70]\n" + ".inst 0x4fa1eb62 // sdot v2.4s, v27.16b, v1.4b[3]\n" + "ldr q1, [x22, #0x30]\n" + "ldr q27, [x20, #0x30]\n" + ".inst 
0x4f9de00d // sdot v13.4s, v0.16b, v29.4b[0]\n" + ".inst 0x4fbde00c // sdot v12.4s, v0.16b, v29.4b[1]\n" + "add x10, x10, #0x80\n" + ".inst 0x4f9de80b // sdot v11.4s, v0.16b, v29.4b[2]\n" + ".inst 0x4fbde806 // sdot v6.4s, v0.16b, v29.4b[3]\n" + ".inst 0x4f9ce019 // sdot v25.4s, v0.16b, v28.4b[0]\n" + ".inst 0x4fbce004 // sdot v4.4s, v0.16b, v28.4b[1]\n" + ".inst 0x4f9ce809 // sdot v9.4s, v0.16b, v28.4b[2]\n" + ".inst 0x4fbce81e // sdot v30.4s, v0.16b, v28.4b[3]\n" + "shl v0.16b, v17.16b, #0x4\n" + "and v17.16b, v17.16b, v16.16b\n" + ".inst 0x4f9de016 // sdot v22.4s, v0.16b, v29.4b[0]\n" + ".inst 0x4fbde00f // sdot v15.4s, v0.16b, v29.4b[1]\n" + ".inst 0x4f9de80e // sdot v14.4s, v0.16b, v29.4b[2]\n" + ".inst 0x4fbde81f // sdot v31.4s, v0.16b, v29.4b[3]\n" + "ldr q29, [x22, #0x40]\n" + ".inst 0x4f9ce003 // sdot v3.4s, v0.16b, v28.4b[0]\n" + ".inst 0x4fbce017 // sdot v23.4s, v0.16b, v28.4b[1]\n" + ".inst 0x4f9ce80a // sdot v10.4s, v0.16b, v28.4b[2]\n" + ".inst 0x4fbce802 // sdot v2.4s, v0.16b, v28.4b[3]\n" + "ldr q0, [x20, #0x40]\n" + "shl v28.16b, v18.16b, #0x4\n" + "and v18.16b, v18.16b, v16.16b\n" + ".inst 0x4f95e38d // sdot v13.4s, v28.16b, v21.4b[0]\n" + ".inst 0x4fb5e38c // sdot v12.4s, v28.16b, v21.4b[1]\n" + ".inst 0x4f95eb8b // sdot v11.4s, v28.16b, v21.4b[2]\n" + ".inst 0x4fb5eb86 // sdot v6.4s, v28.16b, v21.4b[3]\n" + ".inst 0x4f87e399 // sdot v25.4s, v28.16b, v7.4b[0]\n" + ".inst 0x4fa7e384 // sdot v4.4s, v28.16b, v7.4b[1]\n" + ".inst 0x4f87eb89 // sdot v9.4s, v28.16b, v7.4b[2]\n" + ".inst 0x4fa7eb9e // sdot v30.4s, v28.16b, v7.4b[3]\n" + "shl v28.16b, v20.16b, #0x4\n" + "and v20.16b, v20.16b, v16.16b\n" + ".inst 0x4f95e396 // sdot v22.4s, v28.16b, v21.4b[0]\n" + ".inst 0x4fb5e38f // sdot v15.4s, v28.16b, v21.4b[1]\n" + ".inst 0x4f95eb8e // sdot v14.4s, v28.16b, v21.4b[2]\n" + ".inst 0x4fb5eb9f // sdot v31.4s, v28.16b, v21.4b[3]\n" + "ldr q21, [x22, #0x50]\n" + ".inst 0x4f87e383 // sdot v3.4s, v28.16b, v7.4b[0]\n" + ".inst 0x4fa7e397 // sdot v23.4s, 
v28.16b, v7.4b[1]\n" + ".inst 0x4f87eb8a // sdot v10.4s, v28.16b, v7.4b[2]\n" + ".inst 0x4fa7eb82 // sdot v2.4s, v28.16b, v7.4b[3]\n" + "ldr q28, [x20, #0x50]\n" + "shl v7.16b, v26.16b, #0x4\n" + "and v26.16b, v26.16b, v16.16b\n" + ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4fa1e0ec // sdot v12.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4f81e8eb // sdot v11.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4fa1e8e6 // sdot v6.4s, v7.16b, v1.4b[3]\n" + ".inst 0x4f9be0f9 // sdot v25.4s, v7.16b, v27.4b[0]\n" + ".inst 0x4fbbe0e4 // sdot v4.4s, v7.16b, v27.4b[1]\n" + ".inst 0x4f9be8e9 // sdot v9.4s, v7.16b, v27.4b[2]\n" + ".inst 0x4fbbe8fe // sdot v30.4s, v7.16b, v27.4b[3]\n" + "ldr q7, [x22, #0x60]\n" + ".inst 0x4f9de10d // sdot v13.4s, v8.16b, v29.4b[0]\n" + ".inst 0x4fbde10c // sdot v12.4s, v8.16b, v29.4b[1]\n" + ".inst 0x4f9de90b // sdot v11.4s, v8.16b, v29.4b[2]\n" + ".inst 0x4fbde906 // sdot v6.4s, v8.16b, v29.4b[3]\n" + ".inst 0x4f80e119 // sdot v25.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4fa0e104 // sdot v4.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4f80e909 // sdot v9.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0e91e // sdot v30.4s, v8.16b, v0.4b[3]\n" + "ldr q8, [x20, #0x60]\n" + ".inst 0x4f95e0ad // sdot v13.4s, v5.16b, v21.4b[0]\n" + ".inst 0x4fb5e0ac // sdot v12.4s, v5.16b, v21.4b[1]\n" + ".inst 0x4f95e8ab // sdot v11.4s, v5.16b, v21.4b[2]\n" + ".inst 0x4fb5e8a6 // sdot v6.4s, v5.16b, v21.4b[3]\n" + ".inst 0x4f9ce0b9 // sdot v25.4s, v5.16b, v28.4b[0]\n" + ".inst 0x4fbce0a4 // sdot v4.4s, v5.16b, v28.4b[1]\n" + ".inst 0x4f9ce8a9 // sdot v9.4s, v5.16b, v28.4b[2]\n" + ".inst 0x4fbce8be // sdot v30.4s, v5.16b, v28.4b[3]\n" + "ldr q5, [x22, #0x70]\n" + "add x22, x22, #0x80\n" + ".inst 0x4f87e24d // sdot v13.4s, v18.16b, v7.4b[0]\n" + ".inst 0x4fa7e24c // sdot v12.4s, v18.16b, v7.4b[1]\n" + ".inst 0x4f87ea4b // sdot v11.4s, v18.16b, v7.4b[2]\n" + ".inst 0x4fa7ea46 // sdot v6.4s, v18.16b, v7.4b[3]\n" + ".inst 0x4f88e259 // sdot v25.4s, v18.16b, v8.4b[0]\n" + ".inst 0x4fa8e244 // sdot 
v4.4s, v18.16b, v8.4b[1]\n" + ".inst 0x4f88ea49 // sdot v9.4s, v18.16b, v8.4b[2]\n" + ".inst 0x4fa8ea5e // sdot v30.4s, v18.16b, v8.4b[3]\n" + "ldr q18, [x20, #0x70]\n" + "add x20, x20, #0x80\n" + ".inst 0x4f85e34d // sdot v13.4s, v26.16b, v5.4b[0]\n" + ".inst 0x4fa5e34c // sdot v12.4s, v26.16b, v5.4b[1]\n" + ".inst 0x4f85eb4b // sdot v11.4s, v26.16b, v5.4b[2]\n" + ".inst 0x4fa5eb46 // sdot v6.4s, v26.16b, v5.4b[3]\n" + ".inst 0x4f92e359 // sdot v25.4s, v26.16b, v18.4b[0]\n" + ".inst 0x4fb2e344 // sdot v4.4s, v26.16b, v18.4b[1]\n" + ".inst 0x4f92eb49 // sdot v9.4s, v26.16b, v18.4b[2]\n" + ".inst 0x4fb2eb5e // sdot v30.4s, v26.16b, v18.4b[3]\n" + "shl v26.16b, v24.16b, #0x4\n" + "and v24.16b, v24.16b, v16.16b\n" + ".inst 0x4f81e356 // sdot v22.4s, v26.16b, v1.4b[0]\n" + ".inst 0x4fa1e34f // sdot v15.4s, v26.16b, v1.4b[1]\n" + ".inst 0x4f81eb4e // sdot v14.4s, v26.16b, v1.4b[2]\n" + ".inst 0x4fa1eb5f // sdot v31.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4f9be343 // sdot v3.4s, v26.16b, v27.4b[0]\n" + ".inst 0x4fbbe357 // sdot v23.4s, v26.16b, v27.4b[1]\n" + ".inst 0x4f9beb4a // sdot v10.4s, v26.16b, v27.4b[2]\n" + ".inst 0x4fbbeb42 // sdot v2.4s, v26.16b, v27.4b[3]\n" + ".inst 0x4f9de276 // sdot v22.4s, v19.16b, v29.4b[0]\n" + ".inst 0x4fbde26f // sdot v15.4s, v19.16b, v29.4b[1]\n" + ".inst 0x4f9dea6e // sdot v14.4s, v19.16b, v29.4b[2]\n" + ".inst 0x4fbdea7f // sdot v31.4s, v19.16b, v29.4b[3]\n" + ".inst 0x4f80e263 // sdot v3.4s, v19.16b, v0.4b[0]\n" + ".inst 0x4fa0e277 // sdot v23.4s, v19.16b, v0.4b[1]\n" + ".inst 0x4f80ea6a // sdot v10.4s, v19.16b, v0.4b[2]\n" + ".inst 0x4fa0ea62 // sdot v2.4s, v19.16b, v0.4b[3]\n" + ".inst 0x4f95e236 // sdot v22.4s, v17.16b, v21.4b[0]\n" + ".inst 0x4fb5e22f // sdot v15.4s, v17.16b, v21.4b[1]\n" + ".inst 0x4f95ea2e // sdot v14.4s, v17.16b, v21.4b[2]\n" + ".inst 0x4fb5ea3f // sdot v31.4s, v17.16b, v21.4b[3]\n" + ".inst 0x4f9ce223 // sdot v3.4s, v17.16b, v28.4b[0]\n" + ".inst 0x4fbce237 // sdot v23.4s, v17.16b, v28.4b[1]\n" + ".inst 
0x4f9cea2a // sdot v10.4s, v17.16b, v28.4b[2]\n" + ".inst 0x4fbcea22 // sdot v2.4s, v17.16b, v28.4b[3]\n" + ".inst 0x4f87e296 // sdot v22.4s, v20.16b, v7.4b[0]\n" + ".inst 0x4fa7e28f // sdot v15.4s, v20.16b, v7.4b[1]\n" + ".inst 0x4f87ea8e // sdot v14.4s, v20.16b, v7.4b[2]\n" + ".inst 0x4fa7ea9f // sdot v31.4s, v20.16b, v7.4b[3]\n" + ".inst 0x4f88e283 // sdot v3.4s, v20.16b, v8.4b[0]\n" + ".inst 0x4fa8e297 // sdot v23.4s, v20.16b, v8.4b[1]\n" + ".inst 0x4f88ea8a // sdot v10.4s, v20.16b, v8.4b[2]\n" + ".inst 0x4fa8ea82 // sdot v2.4s, v20.16b, v8.4b[3]\n" + ".inst 0x4f85e316 // sdot v22.4s, v24.16b, v5.4b[0]\n" + ".inst 0x4fa5e30f // sdot v15.4s, v24.16b, v5.4b[1]\n" + ".inst 0x4f85eb0e // sdot v14.4s, v24.16b, v5.4b[2]\n" + ".inst 0x4fa5eb1f // sdot v31.4s, v24.16b, v5.4b[3]\n" + ".inst 0x4f92e303 // sdot v3.4s, v24.16b, v18.4b[0]\n" + ".inst 0x4fb2e317 // sdot v23.4s, v24.16b, v18.4b[1]\n" + ".inst 0x4f92eb0a // sdot v10.4s, v24.16b, v18.4b[2]\n" + ".inst 0x4fb2eb02 // sdot v2.4s, v24.16b, v18.4b[3]\n" + "bgt 3b\n" + "ldr q28, [x10, #0x0]\n" + "ldr q26, [x10, #0x10]\n" + "ld1 { v18.4s }, [x22]\n" + "ldr q29, [x10, #0x20]\n" + "add x22, x22, #0x10\n" + "ldr q27, [x10, #0x30]\n" + "ldr q5, [x22, #0x0]\n" + "add x10, x10, #0x40\n" + "mla v13.4s, v28.4s, v18.s[0]\n" + "mla v22.4s, v26.4s, v18.s[0]\n" + "mla v12.4s, v28.4s, v18.s[1]\n" + "mla v15.4s, v26.4s, v18.s[1]\n" + "mla v11.4s, v28.4s, v18.s[2]\n" + "mla v14.4s, v26.4s, v18.s[2]\n" + "mla v6.4s, v28.4s, v18.s[3]\n" + "fmul v17.4s, v29.4s, v5.s[0]\n" + "mla v31.4s, v26.4s, v18.s[3]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v24.4s, v27.4s, v5.s[0]\n" + "scvtf v22.4s, v22.4s\n" + "fmul v21.4s, v29.4s, v5.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v20.4s, v27.4s, v5.s[1]\n" + "scvtf v15.4s, v15.4s\n" + "fmul v19.4s, v29.4s, v5.s[2]\n" + "scvtf v11.4s, v11.4s\n" + "fmul v18.4s, v27.4s, v5.s[2]\n" + "scvtf v14.4s, v14.4s\n" + "fmul v1.4s, v29.4s, v5.s[3]\n" + "scvtf v6.4s, v6.4s\n" + "fmul v8.4s, v27.4s, v5.s[3]\n" + 
"scvtf v31.4s, v31.4s\n" + "fmul v13.4s, v13.4s, v17.4s\n" + "fmul v22.4s, v22.4s, v24.4s\n" + "fmul v12.4s, v12.4s, v21.4s\n" + "fmul v15.4s, v15.4s, v20.4s\n" + "fmul v11.4s, v11.4s, v19.4s\n" + "fmul v14.4s, v14.4s, v18.4s\n" + "fmul v6.4s, v6.4s, v1.4s\n" + "fmul v31.4s, v31.4s, v8.4s\n" + "ld1 { v1.4s }, [x20]\n" + "add x20, x20, #0x10\n" + "ldr q24, [x20, #0x0]\n" + "mla v25.4s, v28.4s, v1.s[0]\n" + "mla v3.4s, v26.4s, v1.s[0]\n" + "mla v4.4s, v28.4s, v1.s[1]\n" + "mla v23.4s, v26.4s, v1.s[1]\n" + "mla v9.4s, v28.4s, v1.s[2]\n" + "mla v10.4s, v26.4s, v1.s[2]\n" + "mla v30.4s, v28.4s, v1.s[3]\n" + "fmul v28.4s, v29.4s, v24.s[0]\n" + "mla v2.4s, v26.4s, v1.s[3]\n" + "scvtf v25.4s, v25.4s\n" + "fmul v5.4s, v27.4s, v24.s[0]\n" + "scvtf v3.4s, v3.4s\n" + "fmul v26.4s, v29.4s, v24.s[1]\n" + "scvtf v4.4s, v4.4s\n" + "fmul v8.4s, v27.4s, v24.s[1]\n" + "scvtf v23.4s, v23.4s\n" + "fmul v21.4s, v29.4s, v24.s[2]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v20.4s, v27.4s, v24.s[2]\n" + "scvtf v10.4s, v10.4s\n" + "fmul v19.4s, v29.4s, v24.s[3]\n" + "scvtf v30.4s, v30.4s\n" + "fmul v18.4s, v27.4s, v24.s[3]\n" + "scvtf v2.4s, v2.4s\n" + "fmul v25.4s, v25.4s, v28.4s\n" + "fmul v3.4s, v3.4s, v5.4s\n" + "fmul v4.4s, v4.4s, v26.4s\n" + "fmul v23.4s, v23.4s, v8.4s\n" + "fmul v9.4s, v9.4s, v21.4s\n" + "fmul v10.4s, v10.4s, v20.4s\n" + "fmul v30.4s, v30.4s, v19.4s\n" + "fmul v2.4s, v2.4s, v18.4s\n" + "ld1r { v19.4s }, [%x[clamp_vals]]\n" + "add x20, %x[clamp_vals], #0x4\n" + "cmp x9, #0x8\n" + "ld1r { v18.4s }, [x20]\n" + "add x10, x10, #0x20\n" + "fmax v13.4s, v13.4s, v19.4s\n" + "fmax v22.4s, v22.4s, v19.4s\n" + "fmax v12.4s, v12.4s, v19.4s\n" + "fmax v15.4s, v15.4s, v19.4s\n" + "fmax v11.4s, v11.4s, v19.4s\n" + "fmax v14.4s, v14.4s, v19.4s\n" + "fmax v6.4s, v6.4s, v19.4s\n" + "fmax v31.4s, v31.4s, v19.4s\n" + "fmax v25.4s, v25.4s, v19.4s\n" + "fmax v3.4s, v3.4s, v19.4s\n" + "fmax v4.4s, v4.4s, v19.4s\n" + "fmax v23.4s, v23.4s, v19.4s\n" + "fmax v9.4s, v9.4s, v19.4s\n" + "fmax v10.4s, 
v10.4s, v19.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" + "fmax v2.4s, v2.4s, v19.4s\n" + "fmin v13.4s, v13.4s, v18.4s\n" + "fmin v22.4s, v22.4s, v18.4s\n" + "fmin v12.4s, v12.4s, v18.4s\n" + "fmin v15.4s, v15.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmin v14.4s, v14.4s, v18.4s\n" + "fmin v6.4s, v6.4s, v18.4s\n" + "fmin v31.4s, v31.4s, v18.4s\n" + "fmin v25.4s, v25.4s, v18.4s\n" + "fmin v3.4s, v3.4s, v18.4s\n" + "fmin v4.4s, v4.4s, v18.4s\n" + "fmin v23.4s, v23.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v30.4s, v30.4s, v18.4s\n" + "fmin v2.4s, v2.4s, v18.4s\n" + "blt 6f\n" + "mov x20, %x[dst]\n" + "str q13, [x20, #0x0]\n" + "str q22, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q12, [x20, #0x0]\n" + "str q15, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q11, [x20, #0x0]\n" + "str q14, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q6, [x20, #0x0]\n" + "str q31, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q25, [x20, #0x0]\n" + "str q3, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q4, [x20, #0x0]\n" + "str q23, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q9, [x20, #0x0]\n" + "str q10, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q30, [x20, #0x0]\n" + "str q2, [x20, #0x10]\n" + "b 11f\n" + "6:" // Partial output + "mov x27, %x[dst]\n" + "add x26, x27, %x[dst_stride_row], LSL #2\n" + "add x25, x26, %x[dst_stride_row], LSL #1\n" + "add x24, x26, %x[dst_stride_row]\n" + "add x23, x25, %x[dst_stride_row]\n" + "add x22, x27, %x[dst_stride_row], LSL #1\n" + "add x21, x27, %x[dst_stride_row]\n" + "add x20, x22, %x[dst_stride_row]\n" + "tbz x9, #2, 8f\n" + "st1 { v30.4s }, [x23], #0x10\n" + "st1 { v9.4s }, [x25], #0x10\n" + "st1 { v4.4s }, [x24], #0x10\n" + "st1 { v25.4s }, [x26], #0x10\n" + "st1 { v6.4s }, [x20], #0x10\n" + "st1 { v11.4s }, [x22], #0x10\n" + "st1 { v12.4s }, [x21], #0x10\n" + "st1 { v13.4s 
}, [x27], #0x10\n" + "tbz x9, #1, 7f\n" + "st1 { v2.d }[0], [x23], #0x8\n" + "st1 { v10.d }[0], [x25], #0x8\n" + "st1 { v23.d }[0], [x24], #0x8\n" + "st1 { v3.d }[0], [x26], #0x8\n" + "st1 { v31.d }[0], [x20], #0x8\n" + "st1 { v14.d }[0], [x22], #0x8\n" + "st1 { v15.d }[0], [x21], #0x8\n" + "st1 { v22.d }[0], [x27], #0x8\n" + "tbz x9, #0, 10f\n" + "st1 { v2.s }[2], [x23]\n" + "st1 { v10.s }[2], [x25]\n" + "st1 { v23.s }[2], [x24]\n" + "st1 { v3.s }[2], [x26]\n" + "st1 { v31.s }[2], [x20]\n" + "st1 { v14.s }[2], [x22]\n" + "st1 { v15.s }[2], [x21]\n" + "st1 { v22.s }[2], [x27]\n" + "b 10f\n" + "7:" // Output block 0: partial_1_4 + "tbz x9, #0, 10f\n" + "st1 { v2.s }[0], [x23]\n" + "st1 { v10.s }[0], [x25]\n" + "st1 { v23.s }[0], [x24]\n" + "st1 { v3.s }[0], [x26]\n" + "st1 { v31.s }[0], [x20]\n" + "st1 { v14.s }[0], [x22]\n" + "st1 { v15.s }[0], [x21]\n" + "st1 { v22.s }[0], [x27]\n" + "b 10f\n" + "8:" // Output block 0: partial_2_0 + "tbz x9, #1, 9f\n" + "st1 { v30.d }[0], [x23], #0x8\n" + "st1 { v9.d }[0], [x25], #0x8\n" + "st1 { v4.d }[0], [x24], #0x8\n" + "st1 { v25.d }[0], [x26], #0x8\n" + "st1 { v6.d }[0], [x20], #0x8\n" + "st1 { v11.d }[0], [x22], #0x8\n" + "st1 { v12.d }[0], [x21], #0x8\n" + "st1 { v13.d }[0], [x27], #0x8\n" + "tbz x9, #0, 10f\n" + "st1 { v30.s }[2], [x23]\n" + "st1 { v9.s }[2], [x25]\n" + "st1 { v4.s }[2], [x24]\n" + "st1 { v25.s }[2], [x26]\n" + "st1 { v6.s }[2], [x20]\n" + "st1 { v11.s }[2], [x22]\n" + "st1 { v12.s }[2], [x21]\n" + "st1 { v13.s }[2], [x27]\n" + "b 10f\n" + "9:" // Output block 0: partial_1_0 + "st1 { v30.s }[0], [x23]\n" + "st1 { v9.s }[0], [x25]\n" + "st1 { v4.s }[0], [x24]\n" + "st1 { v25.s }[0], [x26]\n" + "st1 { v6.s }[0], [x20]\n" + "st1 { v11.s }[0], [x22]\n" + "st1 { v12.s }[0], [x21]\n" + "st1 { v13.s }[0], [x27]\n" + "10:" // Output block 0: Done + "11:" // Output stage exit + "subs x9, x9, #0x8\n" + "add %x[dst], %x[dst], #0x20\n" + "bgt 2b\n" + "mov x20, #0x2\n" + "sub x12, x12, #0x8\n" + "cmp x12, #0x8\n" + 
"mov %x[dst], x28\n" + "madd %x[lhs_packed], x20, x11, %x[lhs_packed]\n" + "bge 1b\n" + "12:" // Row loop skip + "cbz x12, 23f\n" + "13:" // Row tail: Row loop + "mov x26, %x[rhs_packed]\n" + "mov x25, %x[n]\n" + "add x24, %x[dst], %x[dst_stride_row], LSL #2\n" + "14:" // Row tail: Column loop + "mov x22, %x[lhs_packed]\n" + "movi v13.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "mov x20, %x[num_blocks]\n" + "movi v12.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v6.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "15:" // Row tail: Sub block loop + "ldr q17, [x26, #0x0]\n" + "ldr q8, [x26, #0x10]\n" + "subs x20, x20, #0x1\n" + "ldr q7, [x22, #0x0]\n" + "ldr q5, [x26, #0x20]\n" + "ldr q4, [x26, #0x30]\n" + "ldr q3, [x22, #0x10]\n" + "ldr q2, [x26, #0x40]\n" + "ldr q10, [x26, #0x50]\n" + "shl v21.16b, v17.16b, #0x4\n" + "shl v18.16b, v8.16b, #0x4\n" + "ldr q20, [x22, #0x20]\n" + "ldr q30, [x26, #0x60]\n" + "shl v24.16b, v5.16b, #0x4\n" + "and v17.16b, v17.16b, v16.16b\n" + "ldr q28, [x26, #0x70]\n" + "ldr q27, [x22, #0x30]\n" + "shl v26.16b, v4.16b, #0x4\n" + "and v8.16b, v8.16b, v16.16b\n" + "ldr q0, [x22, #0x40]\n" + "ldr q19, [x22, #0x50]\n" + ".inst 0x4f87e2ad // sdot v13.4s, v21.16b, v7.4b[0]\n" + ".inst 0x4f87e256 // sdot v22.4s, v18.16b, v7.4b[0]\n" + "ldr q23, [x22, #0x60]\n" + "ldr q25, [x22, #0x70]\n" + ".inst 0x4fa7e2ac // sdot v12.4s, v21.16b, v7.4b[1]\n" + ".inst 0x4fa7e24f // sdot v15.4s, v18.16b, v7.4b[1]\n" + ".inst 0x4f87eaab // sdot v11.4s, v21.16b, v7.4b[2]\n" + ".inst 0x4f87ea4e // sdot v14.4s, v18.16b, v7.4b[2]\n" + "shl v29.16b, v2.16b, #0x4\n" + "add x26, x26, #0x80\n" + ".inst 0x4fa7eaa6 // sdot v6.4s, v21.16b, v7.4b[3]\n" + ".inst 0x4fa7ea5f // sdot v31.4s, v18.16b, v7.4b[3]\n" + "shl v1.16b, v10.16b, #0x4\n" + "add x22, x22, #0x80\n" + ".inst 0x4f83e30d // sdot v13.4s, v24.16b, v3.4b[0]\n" + ".inst 0x4f83e356 // sdot v22.4s, v26.16b, v3.4b[0]\n" + "shl v21.16b, v30.16b, #0x4\n" + ".inst 0x4fa3e30c // sdot v12.4s, 
v24.16b, v3.4b[1]\n" + ".inst 0x4fa3e34f // sdot v15.4s, v26.16b, v3.4b[1]\n" + "shl v18.16b, v28.16b, #0x4\n" + ".inst 0x4f83eb0b // sdot v11.4s, v24.16b, v3.4b[2]\n" + ".inst 0x4f83eb4e // sdot v14.4s, v26.16b, v3.4b[2]\n" + "and v5.16b, v5.16b, v16.16b\n" + ".inst 0x4fa3eb06 // sdot v6.4s, v24.16b, v3.4b[3]\n" + ".inst 0x4fa3eb5f // sdot v31.4s, v26.16b, v3.4b[3]\n" + "and v4.16b, v4.16b, v16.16b\n" + ".inst 0x4f94e3ad // sdot v13.4s, v29.16b, v20.4b[0]\n" + ".inst 0x4f94e036 // sdot v22.4s, v1.16b, v20.4b[0]\n" + "and v2.16b, v2.16b, v16.16b\n" + ".inst 0x4fb4e3ac // sdot v12.4s, v29.16b, v20.4b[1]\n" + ".inst 0x4fb4e02f // sdot v15.4s, v1.16b, v20.4b[1]\n" + "and v10.16b, v10.16b, v16.16b\n" + ".inst 0x4f94ebab // sdot v11.4s, v29.16b, v20.4b[2]\n" + ".inst 0x4f94e82e // sdot v14.4s, v1.16b, v20.4b[2]\n" + "and v30.16b, v30.16b, v16.16b\n" + ".inst 0x4fb4eba6 // sdot v6.4s, v29.16b, v20.4b[3]\n" + ".inst 0x4fb4e83f // sdot v31.4s, v1.16b, v20.4b[3]\n" + "and v28.16b, v28.16b, v16.16b\n" + ".inst 0x4f9be2ad // sdot v13.4s, v21.16b, v27.4b[0]\n" + ".inst 0x4f9be256 // sdot v22.4s, v18.16b, v27.4b[0]\n" + ".inst 0x4fbbe2ac // sdot v12.4s, v21.16b, v27.4b[1]\n" + ".inst 0x4fbbe24f // sdot v15.4s, v18.16b, v27.4b[1]\n" + ".inst 0x4f9beaab // sdot v11.4s, v21.16b, v27.4b[2]\n" + ".inst 0x4f9bea4e // sdot v14.4s, v18.16b, v27.4b[2]\n" + ".inst 0x4fbbeaa6 // sdot v6.4s, v21.16b, v27.4b[3]\n" + ".inst 0x4fbbea5f // sdot v31.4s, v18.16b, v27.4b[3]\n" + ".inst 0x4f80e22d // sdot v13.4s, v17.16b, v0.4b[0]\n" + ".inst 0x4f80e116 // sdot v22.4s, v8.16b, v0.4b[0]\n" + ".inst 0x4fa0e22c // sdot v12.4s, v17.16b, v0.4b[1]\n" + ".inst 0x4fa0e10f // sdot v15.4s, v8.16b, v0.4b[1]\n" + ".inst 0x4f80ea2b // sdot v11.4s, v17.16b, v0.4b[2]\n" + ".inst 0x4f80e90e // sdot v14.4s, v8.16b, v0.4b[2]\n" + ".inst 0x4fa0ea26 // sdot v6.4s, v17.16b, v0.4b[3]\n" + ".inst 0x4fa0e91f // sdot v31.4s, v8.16b, v0.4b[3]\n" + ".inst 0x4f93e0ad // sdot v13.4s, v5.16b, v19.4b[0]\n" + ".inst 0x4f93e096 
// sdot v22.4s, v4.16b, v19.4b[0]\n" + ".inst 0x4fb3e0ac // sdot v12.4s, v5.16b, v19.4b[1]\n" + ".inst 0x4fb3e08f // sdot v15.4s, v4.16b, v19.4b[1]\n" + ".inst 0x4f93e8ab // sdot v11.4s, v5.16b, v19.4b[2]\n" + ".inst 0x4f93e88e // sdot v14.4s, v4.16b, v19.4b[2]\n" + ".inst 0x4fb3e8a6 // sdot v6.4s, v5.16b, v19.4b[3]\n" + ".inst 0x4fb3e89f // sdot v31.4s, v4.16b, v19.4b[3]\n" + ".inst 0x4f97e04d // sdot v13.4s, v2.16b, v23.4b[0]\n" + ".inst 0x4f97e156 // sdot v22.4s, v10.16b, v23.4b[0]\n" + ".inst 0x4fb7e04c // sdot v12.4s, v2.16b, v23.4b[1]\n" + ".inst 0x4fb7e14f // sdot v15.4s, v10.16b, v23.4b[1]\n" + ".inst 0x4f97e84b // sdot v11.4s, v2.16b, v23.4b[2]\n" + ".inst 0x4f97e94e // sdot v14.4s, v10.16b, v23.4b[2]\n" + ".inst 0x4fb7e846 // sdot v6.4s, v2.16b, v23.4b[3]\n" + ".inst 0x4fb7e95f // sdot v31.4s, v10.16b, v23.4b[3]\n" + ".inst 0x4f99e3cd // sdot v13.4s, v30.16b, v25.4b[0]\n" + ".inst 0x4f99e396 // sdot v22.4s, v28.16b, v25.4b[0]\n" + ".inst 0x4fb9e3cc // sdot v12.4s, v30.16b, v25.4b[1]\n" + ".inst 0x4fb9e38f // sdot v15.4s, v28.16b, v25.4b[1]\n" + ".inst 0x4f99ebcb // sdot v11.4s, v30.16b, v25.4b[2]\n" + ".inst 0x4f99eb8e // sdot v14.4s, v28.16b, v25.4b[2]\n" + ".inst 0x4fb9ebc6 // sdot v6.4s, v30.16b, v25.4b[3]\n" + ".inst 0x4fb9eb9f // sdot v31.4s, v28.16b, v25.4b[3]\n" + "bgt 15b\n" + "ldr q21, [x26, #0x0]\n" + "ldr q20, [x26, #0x10]\n" + "ld1 { v5.4s }, [x22]\n" + "ldr q18, [x26, #0x20]\n" + "add x22, x22, #0x10\n" + "ldr q29, [x26, #0x30]\n" + "ldr q19, [x22, #0x0]\n" + "add x26, x26, #0x40\n" + "mla v13.4s, v21.4s, v5.s[0]\n" + "mla v22.4s, v20.4s, v5.s[0]\n" + "mla v12.4s, v21.4s, v5.s[1]\n" + "mla v15.4s, v20.4s, v5.s[1]\n" + "mla v11.4s, v21.4s, v5.s[2]\n" + "mla v14.4s, v20.4s, v5.s[2]\n" + "mla v6.4s, v21.4s, v5.s[3]\n" + "fmul v24.4s, v18.4s, v19.s[0]\n" + "mla v31.4s, v20.4s, v5.s[3]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v25.4s, v29.4s, v19.s[0]\n" + "scvtf v22.4s, v22.4s\n" + "fmul v21.4s, v18.4s, v19.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul 
v20.4s, v29.4s, v19.s[1]\n" + "scvtf v15.4s, v15.4s\n" + "fmul v9.4s, v18.4s, v19.s[2]\n" + "scvtf v11.4s, v11.4s\n" + "fmul v4.4s, v29.4s, v19.s[2]\n" + "scvtf v14.4s, v14.4s\n" + "fmul v5.4s, v18.4s, v19.s[3]\n" + "scvtf v6.4s, v6.4s\n" + "fmul v27.4s, v29.4s, v19.s[3]\n" + "scvtf v31.4s, v31.4s\n" + "fmul v13.4s, v13.4s, v24.4s\n" + "fmul v22.4s, v22.4s, v25.4s\n" + "fmul v12.4s, v12.4s, v21.4s\n" + "fmul v15.4s, v15.4s, v20.4s\n" + "fmul v11.4s, v11.4s, v9.4s\n" + "fmul v14.4s, v14.4s, v4.4s\n" + "fmul v6.4s, v6.4s, v5.4s\n" + "fmul v31.4s, v31.4s, v27.4s\n" + "ld1r { v7.4s }, [%x[clamp_vals]]\n" + "add x20, %x[clamp_vals], #0x4\n" + "cmp x25, #0x8\n" + "ld1r { v29.4s }, [x20]\n" + "add x26, x26, #0x20\n" + "fmax v13.4s, v13.4s, v7.4s\n" + "fmax v22.4s, v22.4s, v7.4s\n" + "fmax v12.4s, v12.4s, v7.4s\n" + "fmax v15.4s, v15.4s, v7.4s\n" + "fmax v11.4s, v11.4s, v7.4s\n" + "fmax v14.4s, v14.4s, v7.4s\n" + "fmax v6.4s, v6.4s, v7.4s\n" + "fmax v31.4s, v31.4s, v7.4s\n" + "fmin v13.4s, v13.4s, v29.4s\n" + "fmin v22.4s, v22.4s, v29.4s\n" + "fmin v12.4s, v12.4s, v29.4s\n" + "fmin v15.4s, v15.4s, v29.4s\n" + "fmin v11.4s, v11.4s, v29.4s\n" + "fmin v14.4s, v14.4s, v29.4s\n" + "fmin v6.4s, v6.4s, v29.4s\n" + "fmin v31.4s, v31.4s, v29.4s\n" + "blt 17f\n" + "mov x20, %x[dst]\n" + "cmp x12, #0x1\n" + "str q13, [x20, #0x0]\n" + "str q22, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "cmp x12, #0x2\n" + "str q12, [x20, #0x0]\n" + "str q15, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "cmp x12, #0x3\n" + "str q11, [x20, #0x0]\n" + "str q14, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "ble 22f\n" + "str q6, [x20, #0x0]\n" + "str q31, [x20, #0x10]\n" + "b 22f\n" + "17:" // Row tail: Partial output + "mov x23, %x[dst]\n" + "cmp x12, #0x1\n" + "add x22, x23, %x[dst_stride_row]\n" + "csel x22, x22, x23, GT\n" + "cmp x12, #0x2\n" + "add x21, x23, %x[dst_stride_row], LSL #1\n" + "csel x21, x21, x22, GT\n" + "cmp x12, 
#0x3\n" + "add x20, x21, %x[dst_stride_row]\n" + "csel x20, x20, x21, GT\n" + "tbz x25, #2, 19f\n" + "st1 { v6.4s }, [x20], #0x10\n" + "st1 { v11.4s }, [x21], #0x10\n" + "st1 { v12.4s }, [x22], #0x10\n" + "st1 { v13.4s }, [x23], #0x10\n" + "tbz x25, #1, 18f\n" + "st1 { v31.d }[0], [x20], #0x8\n" + "st1 { v14.d }[0], [x21], #0x8\n" + "st1 { v15.d }[0], [x22], #0x8\n" + "st1 { v22.d }[0], [x23], #0x8\n" + "tbz x25, #0, 21f\n" + "st1 { v31.s }[2], [x20]\n" + "st1 { v14.s }[2], [x21]\n" + "st1 { v15.s }[2], [x22]\n" + "st1 { v22.s }[2], [x23]\n" + "b 21f\n" + "18:" // Row tail: Output block 0: partial_1_4 + "tbz x25, #0, 21f\n" + "st1 { v31.s }[0], [x20]\n" + "st1 { v14.s }[0], [x21]\n" + "st1 { v15.s }[0], [x22]\n" + "st1 { v22.s }[0], [x23]\n" + "b 21f\n" + "19:" // Row tail: Output block 0: partial_2_0 + "tbz x25, #1, 20f\n" + "st1 { v6.d }[0], [x20], #0x8\n" + "st1 { v11.d }[0], [x21], #0x8\n" + "st1 { v12.d }[0], [x22], #0x8\n" + "st1 { v13.d }[0], [x23], #0x8\n" + "tbz x25, #0, 21f\n" + "st1 { v6.s }[2], [x20]\n" + "st1 { v11.s }[2], [x21]\n" + "st1 { v12.s }[2], [x22]\n" + "st1 { v13.s }[2], [x23]\n" + "b 21f\n" + "20:" // Row tail: Output block 0: partial_1_0 + "st1 { v6.s }[0], [x20]\n" + "st1 { v11.s }[0], [x21]\n" + "st1 { v12.s }[0], [x22]\n" + "st1 { v13.s }[0], [x23]\n" + "21:" // Row tail: Output block 0: Done + "22:" // Row tail: Output stage exit + "subs x25, x25, #0x8\n" + "add %x[dst], %x[dst], #0x20\n" + "bgt 14b\n" + "subs x12, x12, #0x4\n" + "add %x[lhs_packed], %x[lhs_packed], x11\n" + "mov %x[dst], x24\n" + "bgt 13b\n" + "23:" // Row tail: Row loop skip + : [dst] "+&r"(dst), [lhs_packed] "+&r"(lhs_packed) + : [clamp_vals] "r"(clamp_vals), [dst_stride_row] "r"(dst_stride_row), [m] "r"(m), [n] "r"(n), + [num_blocks] "r"(num_blocks), [rhs_packed] "r"(rhs_packed) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 
"v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} + +#endif // Architectural features check. diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h new file mode 100644 index 00000000..c6f36e29 --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h @@ -0,0 +1,137 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. +/// +/// @return the m step value +size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. 
+/// +/// @return the n step +size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the mr value, which must be used to pack the LHS matrix +/// +/// @return the mr value +size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value +size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value +size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value +size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); + +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed Signed 8-bit quantized asymmetric per-row (qai8dxp) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8 +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix +size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m_idx, size_t k); + +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed Signed 4-bit quantized symmetric per-channel (qsi4cxp) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8. +/// @param[in] k The common dimension between the LHS and RHS matrix (K). 
+/// +/// @return the offset in bytes to the packed RHS matrix +size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( + size_t n_idx, // + size_t k); + +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 8. +/// @param[in] n_idx Column index in the DST matrix. It must be a multiple of 8. +/// @param[in] dst_stride The number of bytes in each row of the DST matrix +/// +/// @return the DST offset in bytes +size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( + size_t m_idx, // + size_t n_idx, // + size_t dst_stride); + +/// Gets the size in bytes for the destination (DST) matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix. +/// @param[in] n Number of columns in the destination (DST) matrix. +/// +/// @return the destination (DST) matrix size in bytes +size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m, size_t n); + +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dxp) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cxp) and packed. +/// Output tile: (rows x cols) = 8 x 8 +/// Accumulation performed in a single for loop: 32 +/// Extension used: dotprod +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension between the LHS and RHS matrix. +/// @param[in] lhs_packed The LHS packed matrix. +/// When the activations are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. 
+/// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref +/// kai_run_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 +/// OR kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst The DST matrix. +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. It must be sizeof(float). +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. +void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( + size_t m, size_t n, size_t k, + + const void* lhs_packed, // + const void* rhs_packed, // + float* dst, // + size_t dst_stride_row, // + size_t dst_stride_col, // + float scalar_min, // + float scalar_max); // + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus -- GitLab From 41d2695ae8749b8316977b615660aa3a6960bc92 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 25 Sep 2024 12:11:03 +0100 Subject: [PATCH 02/12] Fix the mr, kr and nr values for dotprod matmul kernels Signed-off-by: Anitha Raj --- ...l_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c | 6 +++--- ...ul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c index 741a6b8c..b87ae257 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c @@ -16,10 +16,10 @@ #include "kai/kai_common.h" static const size_t kai_m_step = 16; -static const size_t kai_n_step = 8; +static const 
size_t kai_n_step = 4; static const size_t kai_mr = 4; -static const size_t kai_nr = 8; -static const size_t kai_kr = 16; +static const size_t kai_nr = 4; +static const size_t kai_kr = 8; static const size_t kai_sr = 2; static const size_t kai_num_bytes_multiplier_lhs = sizeof(float); static const size_t kai_num_bytes_multiplier_rhs = sizeof(float); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c index 69e58159..e6c745f0 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c @@ -17,9 +17,9 @@ static const size_t kai_m_step = 8; static const size_t kai_n_step = 8; -static const size_t kai_mr = 1; +static const size_t kai_mr = 4; static const size_t kai_nr = 8; -static const size_t kai_kr = 16; +static const size_t kai_kr = 8; static const size_t kai_sr = 2; static const size_t kai_num_bytes_multiplier_lhs = sizeof(float); static const size_t kai_num_bytes_multiplier_rhs = sizeof(float); -- GitLab From 35edbdade4910a8a4314df3c890c60dd8d74fe5e Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 25 Sep 2024 14:12:55 +0100 Subject: [PATCH 03/12] Add dotprod matmul ukernels to Build.bazel and benchmark Signed-off-by: Anitha Raj --- kai/ukernels/matmul/BUILD.bazel | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 67aa0d9e..59c00e0c 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -71,6 +71,26 @@ kai_c_library( ], ) +kai_c_library( + name = "clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod", + srcs = 
["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c"], + hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h"], + cpu_uarch = kai_cpu_dotprod(), + deps = [ + ":clamp_f32_qai8dxp_qsi4cxp_interface", + ], +) + +kai_c_library( + name = "clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod", + srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c"], + hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h"], + cpu_uarch = kai_cpu_dotprod(), + deps = [ + ":clamp_f32_qai8dxp_qsi4cxp_interface", + ], +) + kai_c_library( name = "clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm", srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c"], @@ -283,7 +303,9 @@ kai_c_library( ":clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm", ":clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm", ":clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm", + ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod", ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm", + ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod", ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm", ":clamp_f32_qai8dxp_qsi4c32p_interface", ":clamp_f32_qsi8d32p_qsi4c32p_dotprod", -- GitLab From 52c78058c371e17f9c8a4dc170e455ac9e2d1b16 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 25 Sep 2024 14:43:53 +0100 Subject: [PATCH 04/12] Add bias in the dotprod int4 matmul ukernels Signed-off-by: Anitha Raj --- ...i8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c | 202 +-- ...ai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c | 1086 +++++++++-------- 2 files changed, 669 insertions(+), 619 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c 
b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c index b87ae257..0dea488e 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c @@ -155,14 +155,14 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( "ldr q3, [x20, #0x0]\n" "ldr q12, [x11, #0x10]\n" "ldr q8, [x27, #0x10]\n" - "ldr q7, [x22, #0x10]\n" + "ldr q4, [x22, #0x10]\n" "shl v9.16b, v13.16b, #0x4\n" "and v13.16b, v13.16b, v10.16b\n" - "ldr q2, [x21, #0x10]\n" + "ldr q0, [x21, #0x10]\n" "ldr q1, [x20, #0x10]\n" "ldr q5, [x11, #0x20]\n" - "ldr q4, [x27, #0x20]\n" - "shl v0.16b, v12.16b, #0x4\n" + "ldr q2, [x27, #0x20]\n" + "shl v7.16b, v12.16b, #0x4\n" "and v12.16b, v12.16b, v10.16b\n" "ldr q11, [x22, #0x20]\n" ".inst 0x4f8ee13f // sdot v31.4s, v9.16b, v14.4b[0]\n" @@ -187,89 +187,89 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( ".inst 0x4fa3e930 // sdot v16.4s, v9.16b, v3.4b[3]\n" "ldr q9, [x27, #0x30]\n" "ldr q3, [x22, #0x30]\n" - ".inst 0x4f88e01f // sdot v31.4s, v0.16b, v8.4b[0]\n" - ".inst 0x4fa8e01e // sdot v30.4s, v0.16b, v8.4b[1]\n" - ".inst 0x4f88e81d // sdot v29.4s, v0.16b, v8.4b[2]\n" - ".inst 0x4fa8e81c // sdot v28.4s, v0.16b, v8.4b[3]\n" + ".inst 0x4f88e0ff // sdot v31.4s, v7.16b, v8.4b[0]\n" + ".inst 0x4fa8e0fe // sdot v30.4s, v7.16b, v8.4b[1]\n" + ".inst 0x4f88e8fd // sdot v29.4s, v7.16b, v8.4b[2]\n" + ".inst 0x4fa8e8fc // sdot v28.4s, v7.16b, v8.4b[3]\n" "ldr q8, [x21, #0x30]\n" - ".inst 0x4f87e01b // sdot v27.4s, v0.16b, v7.4b[0]\n" - ".inst 0x4fa7e01a // sdot v26.4s, v0.16b, v7.4b[1]\n" - ".inst 0x4f87e819 // sdot v25.4s, v0.16b, v7.4b[2]\n" - ".inst 0x4fa7e818 // sdot v24.4s, v0.16b, v7.4b[3]\n" - "ldr q7, [x20, #0x30]\n" - ".inst 0x4f82e017 // sdot v23.4s, v0.16b, 
v2.4b[0]\n" - ".inst 0x4fa2e016 // sdot v22.4s, v0.16b, v2.4b[1]\n" - ".inst 0x4f82e815 // sdot v21.4s, v0.16b, v2.4b[2]\n" - ".inst 0x4fa2e814 // sdot v20.4s, v0.16b, v2.4b[3]\n" - "ldr q2, [x27, #0x40]\n" - ".inst 0x4f81e013 // sdot v19.4s, v0.16b, v1.4b[0]\n" - ".inst 0x4fa1e012 // sdot v18.4s, v0.16b, v1.4b[1]\n" - ".inst 0x4f81e811 // sdot v17.4s, v0.16b, v1.4b[2]\n" - ".inst 0x4fa1e810 // sdot v16.4s, v0.16b, v1.4b[3]\n" - "ldr q0, [x22, #0x40]\n" - "shl v1.16b, v5.16b, #0x4\n" + ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa4e0fa // sdot v26.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa4e8f8 // sdot v24.4s, v7.16b, v4.4b[3]\n" + "ldr q4, [x20, #0x30]\n" + ".inst 0x4f80e0f7 // sdot v23.4s, v7.16b, v0.4b[0]\n" + ".inst 0x4fa0e0f6 // sdot v22.4s, v7.16b, v0.4b[1]\n" + ".inst 0x4f80e8f5 // sdot v21.4s, v7.16b, v0.4b[2]\n" + ".inst 0x4fa0e8f4 // sdot v20.4s, v7.16b, v0.4b[3]\n" + "ldr q0, [x27, #0x40]\n" + ".inst 0x4f81e0f3 // sdot v19.4s, v7.16b, v1.4b[0]\n" + ".inst 0x4fa1e0f2 // sdot v18.4s, v7.16b, v1.4b[1]\n" + ".inst 0x4f81e8f1 // sdot v17.4s, v7.16b, v1.4b[2]\n" + ".inst 0x4fa1e8f0 // sdot v16.4s, v7.16b, v1.4b[3]\n" + "ldr q1, [x22, #0x40]\n" + "shl v7.16b, v5.16b, #0x4\n" "and v5.16b, v5.16b, v10.16b\n" - ".inst 0x4f84e03f // sdot v31.4s, v1.16b, v4.4b[0]\n" - ".inst 0x4fa4e03e // sdot v30.4s, v1.16b, v4.4b[1]\n" - ".inst 0x4f84e83d // sdot v29.4s, v1.16b, v4.4b[2]\n" - ".inst 0x4fa4e83c // sdot v28.4s, v1.16b, v4.4b[3]\n" - "ldr q4, [x21, #0x40]\n" - ".inst 0x4f8be03b // sdot v27.4s, v1.16b, v11.4b[0]\n" - ".inst 0x4fabe03a // sdot v26.4s, v1.16b, v11.4b[1]\n" - ".inst 0x4f8be839 // sdot v25.4s, v1.16b, v11.4b[2]\n" - ".inst 0x4fabe838 // sdot v24.4s, v1.16b, v11.4b[3]\n" + ".inst 0x4f82e0ff // sdot v31.4s, v7.16b, v2.4b[0]\n" + ".inst 0x4fa2e0fe // sdot v30.4s, v7.16b, v2.4b[1]\n" + ".inst 0x4f82e8fd // sdot v29.4s, v7.16b, v2.4b[2]\n" + ".inst 0x4fa2e8fc // sdot v28.4s, v7.16b, 
v2.4b[3]\n" + "ldr q2, [x21, #0x40]\n" + ".inst 0x4f8be0fb // sdot v27.4s, v7.16b, v11.4b[0]\n" + ".inst 0x4fabe0fa // sdot v26.4s, v7.16b, v11.4b[1]\n" + ".inst 0x4f8be8f9 // sdot v25.4s, v7.16b, v11.4b[2]\n" + ".inst 0x4fabe8f8 // sdot v24.4s, v7.16b, v11.4b[3]\n" "ldr q11, [x20, #0x40]\n" - ".inst 0x4f8ee037 // sdot v23.4s, v1.16b, v14.4b[0]\n" - ".inst 0x4faee036 // sdot v22.4s, v1.16b, v14.4b[1]\n" - ".inst 0x4f8ee835 // sdot v21.4s, v1.16b, v14.4b[2]\n" - ".inst 0x4faee834 // sdot v20.4s, v1.16b, v14.4b[3]\n" + ".inst 0x4f8ee0f7 // sdot v23.4s, v7.16b, v14.4b[0]\n" + ".inst 0x4faee0f6 // sdot v22.4s, v7.16b, v14.4b[1]\n" + ".inst 0x4f8ee8f5 // sdot v21.4s, v7.16b, v14.4b[2]\n" + ".inst 0x4faee8f4 // sdot v20.4s, v7.16b, v14.4b[3]\n" "ldr q14, [x27, #0x50]\n" - ".inst 0x4f86e033 // sdot v19.4s, v1.16b, v6.4b[0]\n" - ".inst 0x4fa6e032 // sdot v18.4s, v1.16b, v6.4b[1]\n" - ".inst 0x4f86e831 // sdot v17.4s, v1.16b, v6.4b[2]\n" - ".inst 0x4fa6e830 // sdot v16.4s, v1.16b, v6.4b[3]\n" + ".inst 0x4f86e0f3 // sdot v19.4s, v7.16b, v6.4b[0]\n" + ".inst 0x4fa6e0f2 // sdot v18.4s, v7.16b, v6.4b[1]\n" + ".inst 0x4f86e8f1 // sdot v17.4s, v7.16b, v6.4b[2]\n" + ".inst 0x4fa6e8f0 // sdot v16.4s, v7.16b, v6.4b[3]\n" "ldr q6, [x22, #0x50]\n" - "shl v1.16b, v15.16b, #0x4\n" + "shl v7.16b, v15.16b, #0x4\n" "and v15.16b, v15.16b, v10.16b\n" "ldr q10, [x21, #0x50]\n" - ".inst 0x4f89e03f // sdot v31.4s, v1.16b, v9.4b[0]\n" - ".inst 0x4fa9e03e // sdot v30.4s, v1.16b, v9.4b[1]\n" - ".inst 0x4f89e83d // sdot v29.4s, v1.16b, v9.4b[2]\n" - ".inst 0x4fa9e83c // sdot v28.4s, v1.16b, v9.4b[3]\n" + ".inst 0x4f89e0ff // sdot v31.4s, v7.16b, v9.4b[0]\n" + ".inst 0x4fa9e0fe // sdot v30.4s, v7.16b, v9.4b[1]\n" + ".inst 0x4f89e8fd // sdot v29.4s, v7.16b, v9.4b[2]\n" + ".inst 0x4fa9e8fc // sdot v28.4s, v7.16b, v9.4b[3]\n" "ldr q9, [x20, #0x50]\n" - ".inst 0x4f83e03b // sdot v27.4s, v1.16b, v3.4b[0]\n" - ".inst 0x4fa3e03a // sdot v26.4s, v1.16b, v3.4b[1]\n" - ".inst 0x4f83e839 // sdot v25.4s, 
v1.16b, v3.4b[2]\n" - ".inst 0x4fa3e838 // sdot v24.4s, v1.16b, v3.4b[3]\n" + ".inst 0x4f83e0fb // sdot v27.4s, v7.16b, v3.4b[0]\n" + ".inst 0x4fa3e0fa // sdot v26.4s, v7.16b, v3.4b[1]\n" + ".inst 0x4f83e8f9 // sdot v25.4s, v7.16b, v3.4b[2]\n" + ".inst 0x4fa3e8f8 // sdot v24.4s, v7.16b, v3.4b[3]\n" "ldr q3, [x27, #0x60]\n" - ".inst 0x4f88e037 // sdot v23.4s, v1.16b, v8.4b[0]\n" - ".inst 0x4fa8e036 // sdot v22.4s, v1.16b, v8.4b[1]\n" - ".inst 0x4f88e835 // sdot v21.4s, v1.16b, v8.4b[2]\n" - ".inst 0x4fa8e834 // sdot v20.4s, v1.16b, v8.4b[3]\n" + ".inst 0x4f88e0f7 // sdot v23.4s, v7.16b, v8.4b[0]\n" + ".inst 0x4fa8e0f6 // sdot v22.4s, v7.16b, v8.4b[1]\n" + ".inst 0x4f88e8f5 // sdot v21.4s, v7.16b, v8.4b[2]\n" + ".inst 0x4fa8e8f4 // sdot v20.4s, v7.16b, v8.4b[3]\n" "ldr q8, [x22, #0x60]\n" - ".inst 0x4f87e033 // sdot v19.4s, v1.16b, v7.4b[0]\n" - ".inst 0x4fa7e032 // sdot v18.4s, v1.16b, v7.4b[1]\n" - ".inst 0x4f87e831 // sdot v17.4s, v1.16b, v7.4b[2]\n" - ".inst 0x4fa7e830 // sdot v16.4s, v1.16b, v7.4b[3]\n" + ".inst 0x4f84e0f3 // sdot v19.4s, v7.16b, v4.4b[0]\n" + ".inst 0x4fa4e0f2 // sdot v18.4s, v7.16b, v4.4b[1]\n" + ".inst 0x4f84e8f1 // sdot v17.4s, v7.16b, v4.4b[2]\n" + ".inst 0x4fa4e8f0 // sdot v16.4s, v7.16b, v4.4b[3]\n" "ldr q7, [x21, #0x60]\n" - "ldr q1, [x20, #0x60]\n" - ".inst 0x4f82e1bf // sdot v31.4s, v13.16b, v2.4b[0]\n" - ".inst 0x4fa2e1be // sdot v30.4s, v13.16b, v2.4b[1]\n" - ".inst 0x4f82e9bd // sdot v29.4s, v13.16b, v2.4b[2]\n" - ".inst 0x4fa2e9bc // sdot v28.4s, v13.16b, v2.4b[3]\n" - "ldr q2, [x27, #0x70]\n" + "ldr q4, [x20, #0x60]\n" + ".inst 0x4f80e1bf // sdot v31.4s, v13.16b, v0.4b[0]\n" + ".inst 0x4fa0e1be // sdot v30.4s, v13.16b, v0.4b[1]\n" + ".inst 0x4f80e9bd // sdot v29.4s, v13.16b, v0.4b[2]\n" + ".inst 0x4fa0e9bc // sdot v28.4s, v13.16b, v0.4b[3]\n" + "ldr q0, [x27, #0x70]\n" "add x27, x27, #0x80\n" - ".inst 0x4f80e1bb // sdot v27.4s, v13.16b, v0.4b[0]\n" - ".inst 0x4fa0e1ba // sdot v26.4s, v13.16b, v0.4b[1]\n" - ".inst 0x4f80e9b9 // 
sdot v25.4s, v13.16b, v0.4b[2]\n" - ".inst 0x4fa0e9b8 // sdot v24.4s, v13.16b, v0.4b[3]\n" - "ldr q0, [x22, #0x70]\n" + ".inst 0x4f81e1bb // sdot v27.4s, v13.16b, v1.4b[0]\n" + ".inst 0x4fa1e1ba // sdot v26.4s, v13.16b, v1.4b[1]\n" + ".inst 0x4f81e9b9 // sdot v25.4s, v13.16b, v1.4b[2]\n" + ".inst 0x4fa1e9b8 // sdot v24.4s, v13.16b, v1.4b[3]\n" + "ldr q1, [x22, #0x70]\n" "add x22, x22, #0x80\n" - ".inst 0x4f84e1b7 // sdot v23.4s, v13.16b, v4.4b[0]\n" - ".inst 0x4fa4e1b6 // sdot v22.4s, v13.16b, v4.4b[1]\n" - ".inst 0x4f84e9b5 // sdot v21.4s, v13.16b, v4.4b[2]\n" - ".inst 0x4fa4e9b4 // sdot v20.4s, v13.16b, v4.4b[3]\n" - "ldr q4, [x21, #0x70]\n" + ".inst 0x4f82e1b7 // sdot v23.4s, v13.16b, v2.4b[0]\n" + ".inst 0x4fa2e1b6 // sdot v22.4s, v13.16b, v2.4b[1]\n" + ".inst 0x4f82e9b5 // sdot v21.4s, v13.16b, v2.4b[2]\n" + ".inst 0x4fa2e9b4 // sdot v20.4s, v13.16b, v2.4b[3]\n" + "ldr q2, [x21, #0x70]\n" "add x21, x21, #0x80\n" ".inst 0x4f8be1b3 // sdot v19.4s, v13.16b, v11.4b[0]\n" ".inst 0x4fabe1b2 // sdot v18.4s, v13.16b, v11.4b[1]\n" @@ -305,22 +305,22 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( ".inst 0x4fa7e0b6 // sdot v22.4s, v5.16b, v7.4b[1]\n" ".inst 0x4f87e8b5 // sdot v21.4s, v5.16b, v7.4b[2]\n" ".inst 0x4fa7e8b4 // sdot v20.4s, v5.16b, v7.4b[3]\n" - ".inst 0x4f81e0b3 // sdot v19.4s, v5.16b, v1.4b[0]\n" - ".inst 0x4fa1e0b2 // sdot v18.4s, v5.16b, v1.4b[1]\n" - ".inst 0x4f81e8b1 // sdot v17.4s, v5.16b, v1.4b[2]\n" - ".inst 0x4fa1e8b0 // sdot v16.4s, v5.16b, v1.4b[3]\n" - ".inst 0x4f82e1ff // sdot v31.4s, v15.16b, v2.4b[0]\n" - ".inst 0x4fa2e1fe // sdot v30.4s, v15.16b, v2.4b[1]\n" - ".inst 0x4f82e9fd // sdot v29.4s, v15.16b, v2.4b[2]\n" - ".inst 0x4fa2e9fc // sdot v28.4s, v15.16b, v2.4b[3]\n" - ".inst 0x4f80e1fb // sdot v27.4s, v15.16b, v0.4b[0]\n" - ".inst 0x4fa0e1fa // sdot v26.4s, v15.16b, v0.4b[1]\n" - ".inst 0x4f80e9f9 // sdot v25.4s, v15.16b, v0.4b[2]\n" - ".inst 0x4fa0e9f8 // sdot v24.4s, v15.16b, v0.4b[3]\n" - ".inst 
0x4f84e1f7 // sdot v23.4s, v15.16b, v4.4b[0]\n" - ".inst 0x4fa4e1f6 // sdot v22.4s, v15.16b, v4.4b[1]\n" - ".inst 0x4f84e9f5 // sdot v21.4s, v15.16b, v4.4b[2]\n" - ".inst 0x4fa4e9f4 // sdot v20.4s, v15.16b, v4.4b[3]\n" + ".inst 0x4f84e0b3 // sdot v19.4s, v5.16b, v4.4b[0]\n" + ".inst 0x4fa4e0b2 // sdot v18.4s, v5.16b, v4.4b[1]\n" + ".inst 0x4f84e8b1 // sdot v17.4s, v5.16b, v4.4b[2]\n" + ".inst 0x4fa4e8b0 // sdot v16.4s, v5.16b, v4.4b[3]\n" + ".inst 0x4f80e1ff // sdot v31.4s, v15.16b, v0.4b[0]\n" + ".inst 0x4fa0e1fe // sdot v30.4s, v15.16b, v0.4b[1]\n" + ".inst 0x4f80e9fd // sdot v29.4s, v15.16b, v0.4b[2]\n" + ".inst 0x4fa0e9fc // sdot v28.4s, v15.16b, v0.4b[3]\n" + ".inst 0x4f81e1fb // sdot v27.4s, v15.16b, v1.4b[0]\n" + ".inst 0x4fa1e1fa // sdot v26.4s, v15.16b, v1.4b[1]\n" + ".inst 0x4f81e9f9 // sdot v25.4s, v15.16b, v1.4b[2]\n" + ".inst 0x4fa1e9f8 // sdot v24.4s, v15.16b, v1.4b[3]\n" + ".inst 0x4f82e1f7 // sdot v23.4s, v15.16b, v2.4b[0]\n" + ".inst 0x4fa2e1f6 // sdot v22.4s, v15.16b, v2.4b[1]\n" + ".inst 0x4f82e9f5 // sdot v21.4s, v15.16b, v2.4b[2]\n" + ".inst 0x4fa2e9f4 // sdot v20.4s, v15.16b, v2.4b[3]\n" ".inst 0x4f8be1f3 // sdot v19.4s, v15.16b, v11.4b[0]\n" ".inst 0x4fabe1f2 // sdot v18.4s, v15.16b, v11.4b[1]\n" ".inst 0x4f8be9f1 // sdot v17.4s, v15.16b, v11.4b[2]\n" @@ -405,11 +405,28 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( "fmul v18.4s, v18.4s, v2.4s\n" "fmul v17.4s, v17.4s, v1.4s\n" "fmul v16.4s, v16.4s, v0.4s\n" + "ldr q2, [x11, #0x0]\n" "ld1r { v1.4s }, [%x[clamp_vals]]\n" "add x20, %x[clamp_vals], #0x4\n" "cmp x10, #0x4\n" "ld1r { v0.4s }, [x20]\n" "add x11, x11, #0x10\n" + "fadd v31.4s, v31.4s, v2.4s\n" + "fadd v30.4s, v30.4s, v2.4s\n" + "fadd v29.4s, v29.4s, v2.4s\n" + "fadd v28.4s, v28.4s, v2.4s\n" + "fadd v27.4s, v27.4s, v2.4s\n" + "fadd v26.4s, v26.4s, v2.4s\n" + "fadd v25.4s, v25.4s, v2.4s\n" + "fadd v24.4s, v24.4s, v2.4s\n" + "fadd v23.4s, v23.4s, v2.4s\n" + "fadd v22.4s, v22.4s, v2.4s\n" + "fadd v21.4s, 
v21.4s, v2.4s\n" + "fadd v20.4s, v20.4s, v2.4s\n" + "fadd v19.4s, v19.4s, v2.4s\n" + "fadd v18.4s, v18.4s, v2.4s\n" + "fadd v17.4s, v17.4s, v2.4s\n" + "fadd v16.4s, v16.4s, v2.4s\n" "fmax v31.4s, v31.4s, v1.4s\n" "fmax v30.4s, v30.4s, v1.4s\n" "fmax v29.4s, v29.4s, v1.4s\n" @@ -654,11 +671,16 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( "fmul v30.4s, v30.4s, v18.4s\n" "fmul v29.4s, v29.4s, v17.4s\n" "fmul v28.4s, v28.4s, v16.4s\n" + "ldr q18, [x26, #0x0]\n" "ld1r { v17.4s }, [%x[clamp_vals]]\n" "add x20, %x[clamp_vals], #0x4\n" "cmp x25, #0x4\n" "ld1r { v16.4s }, [x20]\n" "add x26, x26, #0x10\n" + "fadd v31.4s, v31.4s, v18.4s\n" + "fadd v30.4s, v30.4s, v18.4s\n" + "fadd v29.4s, v29.4s, v18.4s\n" + "fadd v28.4s, v28.4s, v18.4s\n" "fmax v31.4s, v31.4s, v17.4s\n" "fmax v30.4s, v30.4s, v17.4s\n" "fmax v29.4s, v29.4s, v17.4s\n" diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c index e6c745f0..3d574c23 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c @@ -115,7 +115,7 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( __asm__ __volatile__( "mov x12, %x[m]\n" "mov x11, #0x80\n" - "movi v16.16b, #0xf0\n" + "movi v13.16b, #0xf0\n" "mov x20, #0x20\n" "cmp x12, #0x8\n" "madd x11, %x[num_blocks], x11, x20\n" @@ -126,335 +126,353 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( "add x28, %x[dst], %x[dst_stride_row], LSL #3\n" "2:" // Column loop "mov x22, %x[lhs_packed]\n" - "movi v13.4s, #0x0\n" - "movi v22.4s, #0x0\n" + "movi v6.4s, #0x0\n" + "movi v15.4s, #0x0\n" "mov x21, %x[num_blocks]\n" + 
"movi v9.4s, #0x0\n" "movi v12.4s, #0x0\n" - "movi v15.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "add x20, x22, x11\n" "movi v11.4s, #0x0\n" "movi v14.4s, #0x0\n" - "add x20, x22, x11\n" - "movi v6.4s, #0x0\n" - "movi v31.4s, #0x0\n" - "movi v25.4s, #0x0\n" - "movi v3.4s, #0x0\n" - "movi v4.4s, #0x0\n" - "movi v23.4s, #0x0\n" - "movi v9.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v8.4s, #0x0\n" + "movi v21.4s, #0x0\n" "movi v10.4s, #0x0\n" - "movi v30.4s, #0x0\n" - "movi v2.4s, #0x0\n" + "movi v4.4s, #0x0\n" + "movi v5.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v3.4s, #0x0\n" "3:" // Sub block loop - "ldr q8, [x10, #0x0]\n" - "ldr q19, [x10, #0x10]\n" + "ldr q31, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" "subs x21, x21, #0x1\n" "ldr q26, [x22, #0x0]\n" - "ldr q1, [x20, #0x0]\n" - "ldr q5, [x10, #0x20]\n" - "ldr q17, [x10, #0x30]\n" - "ldr q29, [x22, #0x10]\n" - "ldr q28, [x20, #0x10]\n" - "shl v24.16b, v8.16b, #0x4\n" - "shl v27.16b, v19.16b, #0x4\n" - "ldr q18, [x10, #0x40]\n" - "ldr q20, [x10, #0x50]\n" - "and v8.16b, v8.16b, v16.16b\n" - "and v19.16b, v19.16b, v16.16b\n" - "ldr q21, [x22, #0x20]\n" - "ldr q7, [x20, #0x20]\n" - "shl v0.16b, v5.16b, #0x4\n" - "and v5.16b, v5.16b, v16.16b\n" - ".inst 0x4f9ae30d // sdot v13.4s, v24.16b, v26.4b[0]\n" - ".inst 0x4f9ae376 // sdot v22.4s, v27.16b, v26.4b[0]\n" - ".inst 0x4fbae30c // sdot v12.4s, v24.16b, v26.4b[1]\n" - ".inst 0x4fbae36f // sdot v15.4s, v27.16b, v26.4b[1]\n" - ".inst 0x4f9aeb0b // sdot v11.4s, v24.16b, v26.4b[2]\n" - ".inst 0x4f9aeb6e // sdot v14.4s, v27.16b, v26.4b[2]\n" - ".inst 0x4fbaeb06 // sdot v6.4s, v24.16b, v26.4b[3]\n" - ".inst 0x4fbaeb7f // sdot v31.4s, v27.16b, v26.4b[3]\n" + "ldr q2, [x20, #0x0]\n" + "ldr q1, [x10, #0x20]\n" + "ldr q16, [x10, #0x30]\n" + "ldr q22, [x22, #0x10]\n" + "ldr q23, [x20, #0x10]\n" + "shl v27.16b, v31.16b, #0x4\n" + "shl v19.16b, v7.16b, #0x4\n" + "ldr q29, [x10, #0x40]\n" + "ldr q25, [x10, #0x50]\n" + "and v31.16b, v31.16b, v13.16b\n" + "and 
v7.16b, v7.16b, v13.16b\n" + "ldr q24, [x22, #0x20]\n" + "ldr q0, [x20, #0x20]\n" + "shl v18.16b, v1.16b, #0x4\n" + "and v1.16b, v1.16b, v13.16b\n" + ".inst 0x4f9ae366 // sdot v6.4s, v27.16b, v26.4b[0]\n" + ".inst 0x4f9ae26f // sdot v15.4s, v19.16b, v26.4b[0]\n" + ".inst 0x4fbae369 // sdot v9.4s, v27.16b, v26.4b[1]\n" + ".inst 0x4fbae26c // sdot v12.4s, v19.16b, v26.4b[1]\n" + ".inst 0x4f9aeb74 // sdot v20.4s, v27.16b, v26.4b[2]\n" + ".inst 0x4f9aea7e // sdot v30.4s, v19.16b, v26.4b[2]\n" + ".inst 0x4fbaeb6b // sdot v11.4s, v27.16b, v26.4b[3]\n" + ".inst 0x4fbaea6e // sdot v14.4s, v19.16b, v26.4b[3]\n" "ldr q26, [x10, #0x60]\n" - ".inst 0x4f81e319 // sdot v25.4s, v24.16b, v1.4b[0]\n" - ".inst 0x4f81e363 // sdot v3.4s, v27.16b, v1.4b[0]\n" - ".inst 0x4fa1e304 // sdot v4.4s, v24.16b, v1.4b[1]\n" - ".inst 0x4fa1e377 // sdot v23.4s, v27.16b, v1.4b[1]\n" - ".inst 0x4f81eb09 // sdot v9.4s, v24.16b, v1.4b[2]\n" - ".inst 0x4f81eb6a // sdot v10.4s, v27.16b, v1.4b[2]\n" - ".inst 0x4fa1eb1e // sdot v30.4s, v24.16b, v1.4b[3]\n" - "ldr q24, [x10, #0x70]\n" - ".inst 0x4fa1eb62 // sdot v2.4s, v27.16b, v1.4b[3]\n" - "ldr q1, [x22, #0x30]\n" - "ldr q27, [x20, #0x30]\n" - ".inst 0x4f9de00d // sdot v13.4s, v0.16b, v29.4b[0]\n" - ".inst 0x4fbde00c // sdot v12.4s, v0.16b, v29.4b[1]\n" + ".inst 0x4f82e371 // sdot v17.4s, v27.16b, v2.4b[0]\n" + ".inst 0x4f82e268 // sdot v8.4s, v19.16b, v2.4b[0]\n" + ".inst 0x4fa2e375 // sdot v21.4s, v27.16b, v2.4b[1]\n" + ".inst 0x4fa2e26a // sdot v10.4s, v19.16b, v2.4b[1]\n" + ".inst 0x4f82eb64 // sdot v4.4s, v27.16b, v2.4b[2]\n" + ".inst 0x4f82ea65 // sdot v5.4s, v19.16b, v2.4b[2]\n" + ".inst 0x4fa2eb7c // sdot v28.4s, v27.16b, v2.4b[3]\n" + "ldr q27, [x10, #0x70]\n" + ".inst 0x4fa2ea63 // sdot v3.4s, v19.16b, v2.4b[3]\n" + "ldr q2, [x22, #0x30]\n" + "ldr q19, [x20, #0x30]\n" + ".inst 0x4f96e246 // sdot v6.4s, v18.16b, v22.4b[0]\n" + ".inst 0x4fb6e249 // sdot v9.4s, v18.16b, v22.4b[1]\n" "add x10, x10, #0x80\n" - ".inst 0x4f9de80b // sdot v11.4s, 
v0.16b, v29.4b[2]\n" - ".inst 0x4fbde806 // sdot v6.4s, v0.16b, v29.4b[3]\n" - ".inst 0x4f9ce019 // sdot v25.4s, v0.16b, v28.4b[0]\n" - ".inst 0x4fbce004 // sdot v4.4s, v0.16b, v28.4b[1]\n" - ".inst 0x4f9ce809 // sdot v9.4s, v0.16b, v28.4b[2]\n" - ".inst 0x4fbce81e // sdot v30.4s, v0.16b, v28.4b[3]\n" - "shl v0.16b, v17.16b, #0x4\n" - "and v17.16b, v17.16b, v16.16b\n" - ".inst 0x4f9de016 // sdot v22.4s, v0.16b, v29.4b[0]\n" - ".inst 0x4fbde00f // sdot v15.4s, v0.16b, v29.4b[1]\n" - ".inst 0x4f9de80e // sdot v14.4s, v0.16b, v29.4b[2]\n" - ".inst 0x4fbde81f // sdot v31.4s, v0.16b, v29.4b[3]\n" - "ldr q29, [x22, #0x40]\n" - ".inst 0x4f9ce003 // sdot v3.4s, v0.16b, v28.4b[0]\n" - ".inst 0x4fbce017 // sdot v23.4s, v0.16b, v28.4b[1]\n" - ".inst 0x4f9ce80a // sdot v10.4s, v0.16b, v28.4b[2]\n" - ".inst 0x4fbce802 // sdot v2.4s, v0.16b, v28.4b[3]\n" - "ldr q0, [x20, #0x40]\n" - "shl v28.16b, v18.16b, #0x4\n" - "and v18.16b, v18.16b, v16.16b\n" - ".inst 0x4f95e38d // sdot v13.4s, v28.16b, v21.4b[0]\n" - ".inst 0x4fb5e38c // sdot v12.4s, v28.16b, v21.4b[1]\n" - ".inst 0x4f95eb8b // sdot v11.4s, v28.16b, v21.4b[2]\n" - ".inst 0x4fb5eb86 // sdot v6.4s, v28.16b, v21.4b[3]\n" - ".inst 0x4f87e399 // sdot v25.4s, v28.16b, v7.4b[0]\n" - ".inst 0x4fa7e384 // sdot v4.4s, v28.16b, v7.4b[1]\n" - ".inst 0x4f87eb89 // sdot v9.4s, v28.16b, v7.4b[2]\n" - ".inst 0x4fa7eb9e // sdot v30.4s, v28.16b, v7.4b[3]\n" - "shl v28.16b, v20.16b, #0x4\n" - "and v20.16b, v20.16b, v16.16b\n" - ".inst 0x4f95e396 // sdot v22.4s, v28.16b, v21.4b[0]\n" - ".inst 0x4fb5e38f // sdot v15.4s, v28.16b, v21.4b[1]\n" - ".inst 0x4f95eb8e // sdot v14.4s, v28.16b, v21.4b[2]\n" - ".inst 0x4fb5eb9f // sdot v31.4s, v28.16b, v21.4b[3]\n" - "ldr q21, [x22, #0x50]\n" - ".inst 0x4f87e383 // sdot v3.4s, v28.16b, v7.4b[0]\n" - ".inst 0x4fa7e397 // sdot v23.4s, v28.16b, v7.4b[1]\n" - ".inst 0x4f87eb8a // sdot v10.4s, v28.16b, v7.4b[2]\n" - ".inst 0x4fa7eb82 // sdot v2.4s, v28.16b, v7.4b[3]\n" - "ldr q28, [x20, #0x50]\n" - "shl 
v7.16b, v26.16b, #0x4\n" - "and v26.16b, v26.16b, v16.16b\n" - ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n" - ".inst 0x4fa1e0ec // sdot v12.4s, v7.16b, v1.4b[1]\n" - ".inst 0x4f81e8eb // sdot v11.4s, v7.16b, v1.4b[2]\n" - ".inst 0x4fa1e8e6 // sdot v6.4s, v7.16b, v1.4b[3]\n" - ".inst 0x4f9be0f9 // sdot v25.4s, v7.16b, v27.4b[0]\n" - ".inst 0x4fbbe0e4 // sdot v4.4s, v7.16b, v27.4b[1]\n" - ".inst 0x4f9be8e9 // sdot v9.4s, v7.16b, v27.4b[2]\n" - ".inst 0x4fbbe8fe // sdot v30.4s, v7.16b, v27.4b[3]\n" - "ldr q7, [x22, #0x60]\n" - ".inst 0x4f9de10d // sdot v13.4s, v8.16b, v29.4b[0]\n" - ".inst 0x4fbde10c // sdot v12.4s, v8.16b, v29.4b[1]\n" - ".inst 0x4f9de90b // sdot v11.4s, v8.16b, v29.4b[2]\n" - ".inst 0x4fbde906 // sdot v6.4s, v8.16b, v29.4b[3]\n" - ".inst 0x4f80e119 // sdot v25.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4fa0e104 // sdot v4.4s, v8.16b, v0.4b[1]\n" - ".inst 0x4f80e909 // sdot v9.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4fa0e91e // sdot v30.4s, v8.16b, v0.4b[3]\n" - "ldr q8, [x20, #0x60]\n" - ".inst 0x4f95e0ad // sdot v13.4s, v5.16b, v21.4b[0]\n" - ".inst 0x4fb5e0ac // sdot v12.4s, v5.16b, v21.4b[1]\n" - ".inst 0x4f95e8ab // sdot v11.4s, v5.16b, v21.4b[2]\n" - ".inst 0x4fb5e8a6 // sdot v6.4s, v5.16b, v21.4b[3]\n" - ".inst 0x4f9ce0b9 // sdot v25.4s, v5.16b, v28.4b[0]\n" - ".inst 0x4fbce0a4 // sdot v4.4s, v5.16b, v28.4b[1]\n" - ".inst 0x4f9ce8a9 // sdot v9.4s, v5.16b, v28.4b[2]\n" - ".inst 0x4fbce8be // sdot v30.4s, v5.16b, v28.4b[3]\n" - "ldr q5, [x22, #0x70]\n" + ".inst 0x4f96ea54 // sdot v20.4s, v18.16b, v22.4b[2]\n" + ".inst 0x4fb6ea4b // sdot v11.4s, v18.16b, v22.4b[3]\n" + ".inst 0x4f97e251 // sdot v17.4s, v18.16b, v23.4b[0]\n" + ".inst 0x4fb7e255 // sdot v21.4s, v18.16b, v23.4b[1]\n" + ".inst 0x4f97ea44 // sdot v4.4s, v18.16b, v23.4b[2]\n" + ".inst 0x4fb7ea5c // sdot v28.4s, v18.16b, v23.4b[3]\n" + "shl v18.16b, v16.16b, #0x4\n" + "and v16.16b, v16.16b, v13.16b\n" + ".inst 0x4f96e24f // sdot v15.4s, v18.16b, v22.4b[0]\n" + ".inst 0x4fb6e24c // sdot 
v12.4s, v18.16b, v22.4b[1]\n" + ".inst 0x4f96ea5e // sdot v30.4s, v18.16b, v22.4b[2]\n" + ".inst 0x4fb6ea4e // sdot v14.4s, v18.16b, v22.4b[3]\n" + "ldr q22, [x22, #0x40]\n" + ".inst 0x4f97e248 // sdot v8.4s, v18.16b, v23.4b[0]\n" + ".inst 0x4fb7e24a // sdot v10.4s, v18.16b, v23.4b[1]\n" + ".inst 0x4f97ea45 // sdot v5.4s, v18.16b, v23.4b[2]\n" + ".inst 0x4fb7ea43 // sdot v3.4s, v18.16b, v23.4b[3]\n" + "ldr q18, [x20, #0x40]\n" + "shl v23.16b, v29.16b, #0x4\n" + "and v29.16b, v29.16b, v13.16b\n" + ".inst 0x4f98e2e6 // sdot v6.4s, v23.16b, v24.4b[0]\n" + ".inst 0x4fb8e2e9 // sdot v9.4s, v23.16b, v24.4b[1]\n" + ".inst 0x4f98eaf4 // sdot v20.4s, v23.16b, v24.4b[2]\n" + ".inst 0x4fb8eaeb // sdot v11.4s, v23.16b, v24.4b[3]\n" + ".inst 0x4f80e2f1 // sdot v17.4s, v23.16b, v0.4b[0]\n" + ".inst 0x4fa0e2f5 // sdot v21.4s, v23.16b, v0.4b[1]\n" + ".inst 0x4f80eae4 // sdot v4.4s, v23.16b, v0.4b[2]\n" + ".inst 0x4fa0eafc // sdot v28.4s, v23.16b, v0.4b[3]\n" + "shl v23.16b, v25.16b, #0x4\n" + "and v25.16b, v25.16b, v13.16b\n" + ".inst 0x4f98e2ef // sdot v15.4s, v23.16b, v24.4b[0]\n" + ".inst 0x4fb8e2ec // sdot v12.4s, v23.16b, v24.4b[1]\n" + ".inst 0x4f98eafe // sdot v30.4s, v23.16b, v24.4b[2]\n" + ".inst 0x4fb8eaee // sdot v14.4s, v23.16b, v24.4b[3]\n" + "ldr q24, [x22, #0x50]\n" + ".inst 0x4f80e2e8 // sdot v8.4s, v23.16b, v0.4b[0]\n" + ".inst 0x4fa0e2ea // sdot v10.4s, v23.16b, v0.4b[1]\n" + ".inst 0x4f80eae5 // sdot v5.4s, v23.16b, v0.4b[2]\n" + ".inst 0x4fa0eae3 // sdot v3.4s, v23.16b, v0.4b[3]\n" + "ldr q23, [x20, #0x50]\n" + "shl v0.16b, v26.16b, #0x4\n" + "and v26.16b, v26.16b, v13.16b\n" + ".inst 0x4f82e006 // sdot v6.4s, v0.16b, v2.4b[0]\n" + ".inst 0x4fa2e009 // sdot v9.4s, v0.16b, v2.4b[1]\n" + ".inst 0x4f82e814 // sdot v20.4s, v0.16b, v2.4b[2]\n" + ".inst 0x4fa2e80b // sdot v11.4s, v0.16b, v2.4b[3]\n" + ".inst 0x4f93e011 // sdot v17.4s, v0.16b, v19.4b[0]\n" + ".inst 0x4fb3e015 // sdot v21.4s, v0.16b, v19.4b[1]\n" + ".inst 0x4f93e804 // sdot v4.4s, v0.16b, v19.4b[2]\n" 
+ ".inst 0x4fb3e81c // sdot v28.4s, v0.16b, v19.4b[3]\n" + "ldr q0, [x22, #0x60]\n" + ".inst 0x4f96e3e6 // sdot v6.4s, v31.16b, v22.4b[0]\n" + ".inst 0x4fb6e3e9 // sdot v9.4s, v31.16b, v22.4b[1]\n" + ".inst 0x4f96ebf4 // sdot v20.4s, v31.16b, v22.4b[2]\n" + ".inst 0x4fb6ebeb // sdot v11.4s, v31.16b, v22.4b[3]\n" + ".inst 0x4f92e3f1 // sdot v17.4s, v31.16b, v18.4b[0]\n" + ".inst 0x4fb2e3f5 // sdot v21.4s, v31.16b, v18.4b[1]\n" + ".inst 0x4f92ebe4 // sdot v4.4s, v31.16b, v18.4b[2]\n" + ".inst 0x4fb2ebfc // sdot v28.4s, v31.16b, v18.4b[3]\n" + "ldr q31, [x20, #0x60]\n" + ".inst 0x4f98e026 // sdot v6.4s, v1.16b, v24.4b[0]\n" + ".inst 0x4fb8e029 // sdot v9.4s, v1.16b, v24.4b[1]\n" + ".inst 0x4f98e834 // sdot v20.4s, v1.16b, v24.4b[2]\n" + ".inst 0x4fb8e82b // sdot v11.4s, v1.16b, v24.4b[3]\n" + ".inst 0x4f97e031 // sdot v17.4s, v1.16b, v23.4b[0]\n" + ".inst 0x4fb7e035 // sdot v21.4s, v1.16b, v23.4b[1]\n" + ".inst 0x4f97e824 // sdot v4.4s, v1.16b, v23.4b[2]\n" + ".inst 0x4fb7e83c // sdot v28.4s, v1.16b, v23.4b[3]\n" + "ldr q1, [x22, #0x70]\n" "add x22, x22, #0x80\n" - ".inst 0x4f87e24d // sdot v13.4s, v18.16b, v7.4b[0]\n" - ".inst 0x4fa7e24c // sdot v12.4s, v18.16b, v7.4b[1]\n" - ".inst 0x4f87ea4b // sdot v11.4s, v18.16b, v7.4b[2]\n" - ".inst 0x4fa7ea46 // sdot v6.4s, v18.16b, v7.4b[3]\n" - ".inst 0x4f88e259 // sdot v25.4s, v18.16b, v8.4b[0]\n" - ".inst 0x4fa8e244 // sdot v4.4s, v18.16b, v8.4b[1]\n" - ".inst 0x4f88ea49 // sdot v9.4s, v18.16b, v8.4b[2]\n" - ".inst 0x4fa8ea5e // sdot v30.4s, v18.16b, v8.4b[3]\n" - "ldr q18, [x20, #0x70]\n" + ".inst 0x4f80e3a6 // sdot v6.4s, v29.16b, v0.4b[0]\n" + ".inst 0x4fa0e3a9 // sdot v9.4s, v29.16b, v0.4b[1]\n" + ".inst 0x4f80ebb4 // sdot v20.4s, v29.16b, v0.4b[2]\n" + ".inst 0x4fa0ebab // sdot v11.4s, v29.16b, v0.4b[3]\n" + ".inst 0x4f9fe3b1 // sdot v17.4s, v29.16b, v31.4b[0]\n" + ".inst 0x4fbfe3b5 // sdot v21.4s, v29.16b, v31.4b[1]\n" + ".inst 0x4f9feba4 // sdot v4.4s, v29.16b, v31.4b[2]\n" + ".inst 0x4fbfebbc // sdot v28.4s, 
v29.16b, v31.4b[3]\n" + "ldr q29, [x20, #0x70]\n" "add x20, x20, #0x80\n" - ".inst 0x4f85e34d // sdot v13.4s, v26.16b, v5.4b[0]\n" - ".inst 0x4fa5e34c // sdot v12.4s, v26.16b, v5.4b[1]\n" - ".inst 0x4f85eb4b // sdot v11.4s, v26.16b, v5.4b[2]\n" - ".inst 0x4fa5eb46 // sdot v6.4s, v26.16b, v5.4b[3]\n" - ".inst 0x4f92e359 // sdot v25.4s, v26.16b, v18.4b[0]\n" - ".inst 0x4fb2e344 // sdot v4.4s, v26.16b, v18.4b[1]\n" - ".inst 0x4f92eb49 // sdot v9.4s, v26.16b, v18.4b[2]\n" - ".inst 0x4fb2eb5e // sdot v30.4s, v26.16b, v18.4b[3]\n" - "shl v26.16b, v24.16b, #0x4\n" - "and v24.16b, v24.16b, v16.16b\n" - ".inst 0x4f81e356 // sdot v22.4s, v26.16b, v1.4b[0]\n" - ".inst 0x4fa1e34f // sdot v15.4s, v26.16b, v1.4b[1]\n" - ".inst 0x4f81eb4e // sdot v14.4s, v26.16b, v1.4b[2]\n" - ".inst 0x4fa1eb5f // sdot v31.4s, v26.16b, v1.4b[3]\n" - ".inst 0x4f9be343 // sdot v3.4s, v26.16b, v27.4b[0]\n" - ".inst 0x4fbbe357 // sdot v23.4s, v26.16b, v27.4b[1]\n" - ".inst 0x4f9beb4a // sdot v10.4s, v26.16b, v27.4b[2]\n" - ".inst 0x4fbbeb42 // sdot v2.4s, v26.16b, v27.4b[3]\n" - ".inst 0x4f9de276 // sdot v22.4s, v19.16b, v29.4b[0]\n" - ".inst 0x4fbde26f // sdot v15.4s, v19.16b, v29.4b[1]\n" - ".inst 0x4f9dea6e // sdot v14.4s, v19.16b, v29.4b[2]\n" - ".inst 0x4fbdea7f // sdot v31.4s, v19.16b, v29.4b[3]\n" - ".inst 0x4f80e263 // sdot v3.4s, v19.16b, v0.4b[0]\n" - ".inst 0x4fa0e277 // sdot v23.4s, v19.16b, v0.4b[1]\n" - ".inst 0x4f80ea6a // sdot v10.4s, v19.16b, v0.4b[2]\n" - ".inst 0x4fa0ea62 // sdot v2.4s, v19.16b, v0.4b[3]\n" - ".inst 0x4f95e236 // sdot v22.4s, v17.16b, v21.4b[0]\n" - ".inst 0x4fb5e22f // sdot v15.4s, v17.16b, v21.4b[1]\n" - ".inst 0x4f95ea2e // sdot v14.4s, v17.16b, v21.4b[2]\n" - ".inst 0x4fb5ea3f // sdot v31.4s, v17.16b, v21.4b[3]\n" - ".inst 0x4f9ce223 // sdot v3.4s, v17.16b, v28.4b[0]\n" - ".inst 0x4fbce237 // sdot v23.4s, v17.16b, v28.4b[1]\n" - ".inst 0x4f9cea2a // sdot v10.4s, v17.16b, v28.4b[2]\n" - ".inst 0x4fbcea22 // sdot v2.4s, v17.16b, v28.4b[3]\n" - ".inst 0x4f87e296 
// sdot v22.4s, v20.16b, v7.4b[0]\n" - ".inst 0x4fa7e28f // sdot v15.4s, v20.16b, v7.4b[1]\n" - ".inst 0x4f87ea8e // sdot v14.4s, v20.16b, v7.4b[2]\n" - ".inst 0x4fa7ea9f // sdot v31.4s, v20.16b, v7.4b[3]\n" - ".inst 0x4f88e283 // sdot v3.4s, v20.16b, v8.4b[0]\n" - ".inst 0x4fa8e297 // sdot v23.4s, v20.16b, v8.4b[1]\n" - ".inst 0x4f88ea8a // sdot v10.4s, v20.16b, v8.4b[2]\n" - ".inst 0x4fa8ea82 // sdot v2.4s, v20.16b, v8.4b[3]\n" - ".inst 0x4f85e316 // sdot v22.4s, v24.16b, v5.4b[0]\n" - ".inst 0x4fa5e30f // sdot v15.4s, v24.16b, v5.4b[1]\n" - ".inst 0x4f85eb0e // sdot v14.4s, v24.16b, v5.4b[2]\n" - ".inst 0x4fa5eb1f // sdot v31.4s, v24.16b, v5.4b[3]\n" - ".inst 0x4f92e303 // sdot v3.4s, v24.16b, v18.4b[0]\n" - ".inst 0x4fb2e317 // sdot v23.4s, v24.16b, v18.4b[1]\n" - ".inst 0x4f92eb0a // sdot v10.4s, v24.16b, v18.4b[2]\n" - ".inst 0x4fb2eb02 // sdot v2.4s, v24.16b, v18.4b[3]\n" + ".inst 0x4f81e346 // sdot v6.4s, v26.16b, v1.4b[0]\n" + ".inst 0x4fa1e349 // sdot v9.4s, v26.16b, v1.4b[1]\n" + ".inst 0x4f81eb54 // sdot v20.4s, v26.16b, v1.4b[2]\n" + ".inst 0x4fa1eb4b // sdot v11.4s, v26.16b, v1.4b[3]\n" + ".inst 0x4f9de351 // sdot v17.4s, v26.16b, v29.4b[0]\n" + ".inst 0x4fbde355 // sdot v21.4s, v26.16b, v29.4b[1]\n" + ".inst 0x4f9deb44 // sdot v4.4s, v26.16b, v29.4b[2]\n" + ".inst 0x4fbdeb5c // sdot v28.4s, v26.16b, v29.4b[3]\n" + "shl v26.16b, v27.16b, #0x4\n" + "and v27.16b, v27.16b, v13.16b\n" + ".inst 0x4f82e34f // sdot v15.4s, v26.16b, v2.4b[0]\n" + ".inst 0x4fa2e34c // sdot v12.4s, v26.16b, v2.4b[1]\n" + ".inst 0x4f82eb5e // sdot v30.4s, v26.16b, v2.4b[2]\n" + ".inst 0x4fa2eb4e // sdot v14.4s, v26.16b, v2.4b[3]\n" + ".inst 0x4f93e348 // sdot v8.4s, v26.16b, v19.4b[0]\n" + ".inst 0x4fb3e34a // sdot v10.4s, v26.16b, v19.4b[1]\n" + ".inst 0x4f93eb45 // sdot v5.4s, v26.16b, v19.4b[2]\n" + ".inst 0x4fb3eb43 // sdot v3.4s, v26.16b, v19.4b[3]\n" + ".inst 0x4f96e0ef // sdot v15.4s, v7.16b, v22.4b[0]\n" + ".inst 0x4fb6e0ec // sdot v12.4s, v7.16b, v22.4b[1]\n" + ".inst 
0x4f96e8fe // sdot v30.4s, v7.16b, v22.4b[2]\n" + ".inst 0x4fb6e8ee // sdot v14.4s, v7.16b, v22.4b[3]\n" + ".inst 0x4f92e0e8 // sdot v8.4s, v7.16b, v18.4b[0]\n" + ".inst 0x4fb2e0ea // sdot v10.4s, v7.16b, v18.4b[1]\n" + ".inst 0x4f92e8e5 // sdot v5.4s, v7.16b, v18.4b[2]\n" + ".inst 0x4fb2e8e3 // sdot v3.4s, v7.16b, v18.4b[3]\n" + ".inst 0x4f98e20f // sdot v15.4s, v16.16b, v24.4b[0]\n" + ".inst 0x4fb8e20c // sdot v12.4s, v16.16b, v24.4b[1]\n" + ".inst 0x4f98ea1e // sdot v30.4s, v16.16b, v24.4b[2]\n" + ".inst 0x4fb8ea0e // sdot v14.4s, v16.16b, v24.4b[3]\n" + ".inst 0x4f97e208 // sdot v8.4s, v16.16b, v23.4b[0]\n" + ".inst 0x4fb7e20a // sdot v10.4s, v16.16b, v23.4b[1]\n" + ".inst 0x4f97ea05 // sdot v5.4s, v16.16b, v23.4b[2]\n" + ".inst 0x4fb7ea03 // sdot v3.4s, v16.16b, v23.4b[3]\n" + ".inst 0x4f80e32f // sdot v15.4s, v25.16b, v0.4b[0]\n" + ".inst 0x4fa0e32c // sdot v12.4s, v25.16b, v0.4b[1]\n" + ".inst 0x4f80eb3e // sdot v30.4s, v25.16b, v0.4b[2]\n" + ".inst 0x4fa0eb2e // sdot v14.4s, v25.16b, v0.4b[3]\n" + ".inst 0x4f9fe328 // sdot v8.4s, v25.16b, v31.4b[0]\n" + ".inst 0x4fbfe32a // sdot v10.4s, v25.16b, v31.4b[1]\n" + ".inst 0x4f9feb25 // sdot v5.4s, v25.16b, v31.4b[2]\n" + ".inst 0x4fbfeb23 // sdot v3.4s, v25.16b, v31.4b[3]\n" + ".inst 0x4f81e36f // sdot v15.4s, v27.16b, v1.4b[0]\n" + ".inst 0x4fa1e36c // sdot v12.4s, v27.16b, v1.4b[1]\n" + ".inst 0x4f81eb7e // sdot v30.4s, v27.16b, v1.4b[2]\n" + ".inst 0x4fa1eb6e // sdot v14.4s, v27.16b, v1.4b[3]\n" + ".inst 0x4f9de368 // sdot v8.4s, v27.16b, v29.4b[0]\n" + ".inst 0x4fbde36a // sdot v10.4s, v27.16b, v29.4b[1]\n" + ".inst 0x4f9deb65 // sdot v5.4s, v27.16b, v29.4b[2]\n" + ".inst 0x4fbdeb63 // sdot v3.4s, v27.16b, v29.4b[3]\n" "bgt 3b\n" - "ldr q28, [x10, #0x0]\n" - "ldr q26, [x10, #0x10]\n" - "ld1 { v18.4s }, [x22]\n" - "ldr q29, [x10, #0x20]\n" + "ldr q29, [x10, #0x0]\n" + "ldr q19, [x10, #0x10]\n" + "ld1 { v24.4s }, [x22]\n" + "ldr q1, [x10, #0x20]\n" "add x22, x22, #0x10\n" - "ldr q27, [x10, #0x30]\n" - "ldr q5, 
[x22, #0x0]\n" + "ldr q2, [x10, #0x30]\n" + "ldr q31, [x22, #0x0]\n" "add x10, x10, #0x40\n" - "mla v13.4s, v28.4s, v18.s[0]\n" - "mla v22.4s, v26.4s, v18.s[0]\n" - "mla v12.4s, v28.4s, v18.s[1]\n" - "mla v15.4s, v26.4s, v18.s[1]\n" - "mla v11.4s, v28.4s, v18.s[2]\n" - "mla v14.4s, v26.4s, v18.s[2]\n" - "mla v6.4s, v28.4s, v18.s[3]\n" - "fmul v17.4s, v29.4s, v5.s[0]\n" - "mla v31.4s, v26.4s, v18.s[3]\n" - "scvtf v13.4s, v13.4s\n" - "fmul v24.4s, v27.4s, v5.s[0]\n" - "scvtf v22.4s, v22.4s\n" - "fmul v21.4s, v29.4s, v5.s[1]\n" - "scvtf v12.4s, v12.4s\n" - "fmul v20.4s, v27.4s, v5.s[1]\n" + "mla v6.4s, v29.4s, v24.s[0]\n" + "mla v15.4s, v19.4s, v24.s[0]\n" + "mla v9.4s, v29.4s, v24.s[1]\n" + "mla v12.4s, v19.4s, v24.s[1]\n" + "mla v20.4s, v29.4s, v24.s[2]\n" + "mla v30.4s, v19.4s, v24.s[2]\n" + "mla v11.4s, v29.4s, v24.s[3]\n" + "fmul v7.4s, v1.4s, v31.s[0]\n" + "mla v14.4s, v19.4s, v24.s[3]\n" + "scvtf v6.4s, v6.4s\n" + "fmul v26.4s, v2.4s, v31.s[0]\n" "scvtf v15.4s, v15.4s\n" - "fmul v19.4s, v29.4s, v5.s[2]\n" + "fmul v24.4s, v1.4s, v31.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v23.4s, v2.4s, v31.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v25.4s, v1.4s, v31.s[2]\n" + "scvtf v20.4s, v20.4s\n" + "fmul v27.4s, v2.4s, v31.s[2]\n" + "scvtf v30.4s, v30.4s\n" + "fmul v22.4s, v1.4s, v31.s[3]\n" "scvtf v11.4s, v11.4s\n" - "fmul v18.4s, v27.4s, v5.s[2]\n" + "fmul v31.4s, v2.4s, v31.s[3]\n" "scvtf v14.4s, v14.4s\n" - "fmul v1.4s, v29.4s, v5.s[3]\n" - "scvtf v6.4s, v6.4s\n" - "fmul v8.4s, v27.4s, v5.s[3]\n" - "scvtf v31.4s, v31.4s\n" - "fmul v13.4s, v13.4s, v17.4s\n" - "fmul v22.4s, v22.4s, v24.4s\n" - "fmul v12.4s, v12.4s, v21.4s\n" - "fmul v15.4s, v15.4s, v20.4s\n" - "fmul v11.4s, v11.4s, v19.4s\n" - "fmul v14.4s, v14.4s, v18.4s\n" - "fmul v6.4s, v6.4s, v1.4s\n" - "fmul v31.4s, v31.4s, v8.4s\n" - "ld1 { v1.4s }, [x20]\n" + "fmul v6.4s, v6.4s, v7.4s\n" + "fmul v15.4s, v15.4s, v26.4s\n" + "fmul v9.4s, v9.4s, v24.4s\n" + "fmul v12.4s, v12.4s, v23.4s\n" + "fmul v20.4s, v20.4s, 
v25.4s\n" + "fmul v30.4s, v30.4s, v27.4s\n" + "fmul v11.4s, v11.4s, v22.4s\n" + "fmul v14.4s, v14.4s, v31.4s\n" + "ld1 { v25.4s }, [x20]\n" "add x20, x20, #0x10\n" - "ldr q24, [x20, #0x0]\n" - "mla v25.4s, v28.4s, v1.s[0]\n" - "mla v3.4s, v26.4s, v1.s[0]\n" - "mla v4.4s, v28.4s, v1.s[1]\n" - "mla v23.4s, v26.4s, v1.s[1]\n" - "mla v9.4s, v28.4s, v1.s[2]\n" - "mla v10.4s, v26.4s, v1.s[2]\n" - "mla v30.4s, v28.4s, v1.s[3]\n" - "fmul v28.4s, v29.4s, v24.s[0]\n" - "mla v2.4s, v26.4s, v1.s[3]\n" - "scvtf v25.4s, v25.4s\n" - "fmul v5.4s, v27.4s, v24.s[0]\n" - "scvtf v3.4s, v3.4s\n" - "fmul v26.4s, v29.4s, v24.s[1]\n" - "scvtf v4.4s, v4.4s\n" - "fmul v8.4s, v27.4s, v24.s[1]\n" - "scvtf v23.4s, v23.4s\n" - "fmul v21.4s, v29.4s, v24.s[2]\n" - "scvtf v9.4s, v9.4s\n" - "fmul v20.4s, v27.4s, v24.s[2]\n" + "ldr q0, [x20, #0x0]\n" + "mla v17.4s, v29.4s, v25.s[0]\n" + "mla v8.4s, v19.4s, v25.s[0]\n" + "mla v21.4s, v29.4s, v25.s[1]\n" + "mla v10.4s, v19.4s, v25.s[1]\n" + "mla v4.4s, v29.4s, v25.s[2]\n" + "mla v5.4s, v19.4s, v25.s[2]\n" + "mla v28.4s, v29.4s, v25.s[3]\n" + "fmul v26.4s, v1.4s, v0.s[0]\n" + "mla v3.4s, v19.4s, v25.s[3]\n" + "scvtf v17.4s, v17.4s\n" + "fmul v18.4s, v2.4s, v0.s[0]\n" + "scvtf v8.4s, v8.4s\n" + "fmul v24.4s, v1.4s, v0.s[1]\n" + "scvtf v21.4s, v21.4s\n" + "fmul v22.4s, v2.4s, v0.s[1]\n" "scvtf v10.4s, v10.4s\n" - "fmul v19.4s, v29.4s, v24.s[3]\n" - "scvtf v30.4s, v30.4s\n" - "fmul v18.4s, v27.4s, v24.s[3]\n" - "scvtf v2.4s, v2.4s\n" - "fmul v25.4s, v25.4s, v28.4s\n" - "fmul v3.4s, v3.4s, v5.4s\n" - "fmul v4.4s, v4.4s, v26.4s\n" - "fmul v23.4s, v23.4s, v8.4s\n" - "fmul v9.4s, v9.4s, v21.4s\n" - "fmul v10.4s, v10.4s, v20.4s\n" - "fmul v30.4s, v30.4s, v19.4s\n" - "fmul v2.4s, v2.4s, v18.4s\n" - "ld1r { v19.4s }, [%x[clamp_vals]]\n" + "fmul v27.4s, v1.4s, v0.s[2]\n" + "scvtf v4.4s, v4.4s\n" + "fmul v23.4s, v2.4s, v0.s[2]\n" + "scvtf v5.4s, v5.4s\n" + "fmul v25.4s, v1.4s, v0.s[3]\n" + "scvtf v28.4s, v28.4s\n" + "fmul v19.4s, v2.4s, v0.s[3]\n" + "scvtf v3.4s, 
v3.4s\n" + "fmul v17.4s, v17.4s, v26.4s\n" + "fmul v8.4s, v8.4s, v18.4s\n" + "fmul v21.4s, v21.4s, v24.4s\n" + "fmul v10.4s, v10.4s, v22.4s\n" + "fmul v4.4s, v4.4s, v27.4s\n" + "fmul v5.4s, v5.4s, v23.4s\n" + "fmul v28.4s, v28.4s, v25.4s\n" + "fmul v3.4s, v3.4s, v19.4s\n" + "ldr q2, [x10, #0x0]\n" + "ldr q22, [x10, #0x10]\n" "add x20, %x[clamp_vals], #0x4\n" "cmp x9, #0x8\n" - "ld1r { v18.4s }, [x20]\n" + "ld1r { v19.4s }, [%x[clamp_vals]]\n" + "ld1r { v7.4s }, [x20]\n" "add x10, x10, #0x20\n" - "fmax v13.4s, v13.4s, v19.4s\n" - "fmax v22.4s, v22.4s, v19.4s\n" - "fmax v12.4s, v12.4s, v19.4s\n" + "fadd v6.4s, v6.4s, v2.4s\n" + "fadd v15.4s, v15.4s, v22.4s\n" + "fadd v9.4s, v9.4s, v2.4s\n" + "fadd v12.4s, v12.4s, v22.4s\n" + "fadd v20.4s, v20.4s, v2.4s\n" + "fadd v30.4s, v30.4s, v22.4s\n" + "fadd v11.4s, v11.4s, v2.4s\n" + "fadd v14.4s, v14.4s, v22.4s\n" + "fadd v17.4s, v17.4s, v2.4s\n" + "fadd v8.4s, v8.4s, v22.4s\n" + "fadd v21.4s, v21.4s, v2.4s\n" + "fadd v10.4s, v10.4s, v22.4s\n" + "fadd v4.4s, v4.4s, v2.4s\n" + "fadd v5.4s, v5.4s, v22.4s\n" + "fadd v28.4s, v28.4s, v2.4s\n" + "fadd v3.4s, v3.4s, v22.4s\n" + "fmax v6.4s, v6.4s, v19.4s\n" "fmax v15.4s, v15.4s, v19.4s\n" + "fmax v9.4s, v9.4s, v19.4s\n" + "fmax v12.4s, v12.4s, v19.4s\n" + "fmax v20.4s, v20.4s, v19.4s\n" + "fmax v30.4s, v30.4s, v19.4s\n" "fmax v11.4s, v11.4s, v19.4s\n" "fmax v14.4s, v14.4s, v19.4s\n" - "fmax v6.4s, v6.4s, v19.4s\n" - "fmax v31.4s, v31.4s, v19.4s\n" - "fmax v25.4s, v25.4s, v19.4s\n" - "fmax v3.4s, v3.4s, v19.4s\n" - "fmax v4.4s, v4.4s, v19.4s\n" - "fmax v23.4s, v23.4s, v19.4s\n" - "fmax v9.4s, v9.4s, v19.4s\n" + "fmax v17.4s, v17.4s, v19.4s\n" + "fmax v8.4s, v8.4s, v19.4s\n" + "fmax v21.4s, v21.4s, v19.4s\n" "fmax v10.4s, v10.4s, v19.4s\n" - "fmax v30.4s, v30.4s, v19.4s\n" - "fmax v2.4s, v2.4s, v19.4s\n" - "fmin v13.4s, v13.4s, v18.4s\n" - "fmin v22.4s, v22.4s, v18.4s\n" - "fmin v12.4s, v12.4s, v18.4s\n" - "fmin v15.4s, v15.4s, v18.4s\n" - "fmin v11.4s, v11.4s, v18.4s\n" - "fmin 
v14.4s, v14.4s, v18.4s\n" - "fmin v6.4s, v6.4s, v18.4s\n" - "fmin v31.4s, v31.4s, v18.4s\n" - "fmin v25.4s, v25.4s, v18.4s\n" - "fmin v3.4s, v3.4s, v18.4s\n" - "fmin v4.4s, v4.4s, v18.4s\n" - "fmin v23.4s, v23.4s, v18.4s\n" - "fmin v9.4s, v9.4s, v18.4s\n" - "fmin v10.4s, v10.4s, v18.4s\n" - "fmin v30.4s, v30.4s, v18.4s\n" - "fmin v2.4s, v2.4s, v18.4s\n" + "fmax v4.4s, v4.4s, v19.4s\n" + "fmax v5.4s, v5.4s, v19.4s\n" + "fmax v28.4s, v28.4s, v19.4s\n" + "fmax v3.4s, v3.4s, v19.4s\n" + "fmin v6.4s, v6.4s, v7.4s\n" + "fmin v15.4s, v15.4s, v7.4s\n" + "fmin v9.4s, v9.4s, v7.4s\n" + "fmin v12.4s, v12.4s, v7.4s\n" + "fmin v20.4s, v20.4s, v7.4s\n" + "fmin v30.4s, v30.4s, v7.4s\n" + "fmin v11.4s, v11.4s, v7.4s\n" + "fmin v14.4s, v14.4s, v7.4s\n" + "fmin v17.4s, v17.4s, v7.4s\n" + "fmin v8.4s, v8.4s, v7.4s\n" + "fmin v21.4s, v21.4s, v7.4s\n" + "fmin v10.4s, v10.4s, v7.4s\n" + "fmin v4.4s, v4.4s, v7.4s\n" + "fmin v5.4s, v5.4s, v7.4s\n" + "fmin v28.4s, v28.4s, v7.4s\n" + "fmin v3.4s, v3.4s, v7.4s\n" "blt 6f\n" "mov x20, %x[dst]\n" - "str q13, [x20, #0x0]\n" - "str q22, [x20, #0x10]\n" - "add x20, x20, %x[dst_stride_row]\n" - "str q12, [x20, #0x0]\n" + "str q6, [x20, #0x0]\n" "str q15, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" + "str q9, [x20, #0x0]\n" + "str q12, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" + "str q20, [x20, #0x0]\n" + "str q30, [x20, #0x10]\n" + "add x20, x20, %x[dst_stride_row]\n" "str q11, [x20, #0x0]\n" "str q14, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" - "str q6, [x20, #0x0]\n" - "str q31, [x20, #0x10]\n" + "str q17, [x20, #0x0]\n" + "str q8, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" - "str q25, [x20, #0x0]\n" - "str q3, [x20, #0x10]\n" + "str q21, [x20, #0x0]\n" + "str q10, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" "str q4, [x20, #0x0]\n" - "str q23, [x20, #0x10]\n" - "add x20, x20, %x[dst_stride_row]\n" - "str q9, [x20, #0x0]\n" - "str q10, [x20, #0x10]\n" + "str q5, [x20, #0x10]\n" "add x20, x20, 
%x[dst_stride_row]\n" - "str q30, [x20, #0x0]\n" - "str q2, [x20, #0x10]\n" + "str q28, [x20, #0x0]\n" + "str q3, [x20, #0x10]\n" "b 11f\n" "6:" // Partial output "mov x27, %x[dst]\n" @@ -466,73 +484,73 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( "add x21, x27, %x[dst_stride_row]\n" "add x20, x22, %x[dst_stride_row]\n" "tbz x9, #2, 8f\n" - "st1 { v30.4s }, [x23], #0x10\n" - "st1 { v9.4s }, [x25], #0x10\n" - "st1 { v4.4s }, [x24], #0x10\n" - "st1 { v25.4s }, [x26], #0x10\n" - "st1 { v6.4s }, [x20], #0x10\n" - "st1 { v11.4s }, [x22], #0x10\n" - "st1 { v12.4s }, [x21], #0x10\n" - "st1 { v13.4s }, [x27], #0x10\n" + "st1 { v28.4s }, [x23], #0x10\n" + "st1 { v4.4s }, [x25], #0x10\n" + "st1 { v21.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x26], #0x10\n" + "st1 { v11.4s }, [x20], #0x10\n" + "st1 { v20.4s }, [x22], #0x10\n" + "st1 { v9.4s }, [x21], #0x10\n" + "st1 { v6.4s }, [x27], #0x10\n" "tbz x9, #1, 7f\n" - "st1 { v2.d }[0], [x23], #0x8\n" - "st1 { v10.d }[0], [x25], #0x8\n" - "st1 { v23.d }[0], [x24], #0x8\n" - "st1 { v3.d }[0], [x26], #0x8\n" - "st1 { v31.d }[0], [x20], #0x8\n" - "st1 { v14.d }[0], [x22], #0x8\n" - "st1 { v15.d }[0], [x21], #0x8\n" - "st1 { v22.d }[0], [x27], #0x8\n" + "st1 { v3.d }[0], [x23], #0x8\n" + "st1 { v5.d }[0], [x25], #0x8\n" + "st1 { v10.d }[0], [x24], #0x8\n" + "st1 { v8.d }[0], [x26], #0x8\n" + "st1 { v14.d }[0], [x20], #0x8\n" + "st1 { v30.d }[0], [x22], #0x8\n" + "st1 { v12.d }[0], [x21], #0x8\n" + "st1 { v15.d }[0], [x27], #0x8\n" "tbz x9, #0, 10f\n" - "st1 { v2.s }[2], [x23]\n" - "st1 { v10.s }[2], [x25]\n" - "st1 { v23.s }[2], [x24]\n" - "st1 { v3.s }[2], [x26]\n" - "st1 { v31.s }[2], [x20]\n" - "st1 { v14.s }[2], [x22]\n" - "st1 { v15.s }[2], [x21]\n" - "st1 { v22.s }[2], [x27]\n" + "st1 { v3.s }[2], [x23]\n" + "st1 { v5.s }[2], [x25]\n" + "st1 { v10.s }[2], [x24]\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v14.s }[2], [x20]\n" + "st1 { v30.s }[2], [x22]\n" + "st1 { v12.s }[2], [x21]\n" + "st1 { v15.s 
}[2], [x27]\n" "b 10f\n" "7:" // Output block 0: partial_1_4 "tbz x9, #0, 10f\n" - "st1 { v2.s }[0], [x23]\n" - "st1 { v10.s }[0], [x25]\n" - "st1 { v23.s }[0], [x24]\n" - "st1 { v3.s }[0], [x26]\n" - "st1 { v31.s }[0], [x20]\n" - "st1 { v14.s }[0], [x22]\n" - "st1 { v15.s }[0], [x21]\n" - "st1 { v22.s }[0], [x27]\n" + "st1 { v3.s }[0], [x23]\n" + "st1 { v5.s }[0], [x25]\n" + "st1 { v10.s }[0], [x24]\n" + "st1 { v8.s }[0], [x26]\n" + "st1 { v14.s }[0], [x20]\n" + "st1 { v30.s }[0], [x22]\n" + "st1 { v12.s }[0], [x21]\n" + "st1 { v15.s }[0], [x27]\n" "b 10f\n" "8:" // Output block 0: partial_2_0 "tbz x9, #1, 9f\n" - "st1 { v30.d }[0], [x23], #0x8\n" - "st1 { v9.d }[0], [x25], #0x8\n" - "st1 { v4.d }[0], [x24], #0x8\n" - "st1 { v25.d }[0], [x26], #0x8\n" - "st1 { v6.d }[0], [x20], #0x8\n" - "st1 { v11.d }[0], [x22], #0x8\n" - "st1 { v12.d }[0], [x21], #0x8\n" - "st1 { v13.d }[0], [x27], #0x8\n" + "st1 { v28.d }[0], [x23], #0x8\n" + "st1 { v4.d }[0], [x25], #0x8\n" + "st1 { v21.d }[0], [x24], #0x8\n" + "st1 { v17.d }[0], [x26], #0x8\n" + "st1 { v11.d }[0], [x20], #0x8\n" + "st1 { v20.d }[0], [x22], #0x8\n" + "st1 { v9.d }[0], [x21], #0x8\n" + "st1 { v6.d }[0], [x27], #0x8\n" "tbz x9, #0, 10f\n" - "st1 { v30.s }[2], [x23]\n" - "st1 { v9.s }[2], [x25]\n" - "st1 { v4.s }[2], [x24]\n" - "st1 { v25.s }[2], [x26]\n" - "st1 { v6.s }[2], [x20]\n" - "st1 { v11.s }[2], [x22]\n" - "st1 { v12.s }[2], [x21]\n" - "st1 { v13.s }[2], [x27]\n" + "st1 { v28.s }[2], [x23]\n" + "st1 { v4.s }[2], [x25]\n" + "st1 { v21.s }[2], [x24]\n" + "st1 { v17.s }[2], [x26]\n" + "st1 { v11.s }[2], [x20]\n" + "st1 { v20.s }[2], [x22]\n" + "st1 { v9.s }[2], [x21]\n" + "st1 { v6.s }[2], [x27]\n" "b 10f\n" "9:" // Output block 0: partial_1_0 - "st1 { v30.s }[0], [x23]\n" - "st1 { v9.s }[0], [x25]\n" - "st1 { v4.s }[0], [x24]\n" - "st1 { v25.s }[0], [x26]\n" - "st1 { v6.s }[0], [x20]\n" - "st1 { v11.s }[0], [x22]\n" - "st1 { v12.s }[0], [x21]\n" - "st1 { v13.s }[0], [x27]\n" + "st1 { v28.s }[0], [x23]\n" + 
"st1 { v4.s }[0], [x25]\n" + "st1 { v21.s }[0], [x24]\n" + "st1 { v17.s }[0], [x26]\n" + "st1 { v11.s }[0], [x20]\n" + "st1 { v20.s }[0], [x22]\n" + "st1 { v9.s }[0], [x21]\n" + "st1 { v6.s }[0], [x27]\n" "10:" // Output block 0: Done "11:" // Output stage exit "subs x9, x9, #0x8\n" @@ -552,196 +570,206 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( "add x24, %x[dst], %x[dst_stride_row], LSL #2\n" "14:" // Row tail: Column loop "mov x22, %x[lhs_packed]\n" - "movi v13.4s, #0x0\n" - "movi v22.4s, #0x0\n" + "movi v6.4s, #0x0\n" + "movi v15.4s, #0x0\n" "mov x20, %x[num_blocks]\n" + "movi v9.4s, #0x0\n" "movi v12.4s, #0x0\n" - "movi v15.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v30.4s, #0x0\n" "movi v11.4s, #0x0\n" "movi v14.4s, #0x0\n" - "movi v6.4s, #0x0\n" - "movi v31.4s, #0x0\n" "15:" // Row tail: Sub block loop - "ldr q17, [x26, #0x0]\n" + "ldr q10, [x26, #0x0]\n" "ldr q8, [x26, #0x10]\n" "subs x20, x20, #0x1\n" "ldr q7, [x22, #0x0]\n" "ldr q5, [x26, #0x20]\n" "ldr q4, [x26, #0x30]\n" "ldr q3, [x22, #0x10]\n" - "ldr q2, [x26, #0x40]\n" - "ldr q10, [x26, #0x50]\n" - "shl v21.16b, v17.16b, #0x4\n" + "ldr q17, [x26, #0x40]\n" + "ldr q1, [x26, #0x50]\n" + "shl v29.16b, v10.16b, #0x4\n" "shl v18.16b, v8.16b, #0x4\n" - "ldr q20, [x22, #0x20]\n" - "ldr q30, [x26, #0x60]\n" - "shl v24.16b, v5.16b, #0x4\n" - "and v17.16b, v17.16b, v16.16b\n" - "ldr q28, [x26, #0x70]\n" - "ldr q27, [x22, #0x30]\n" + "ldr q2, [x22, #0x20]\n" + "ldr q31, [x26, #0x60]\n" + "shl v27.16b, v5.16b, #0x4\n" + "and v10.16b, v10.16b, v13.16b\n" + "ldr q0, [x26, #0x70]\n" + "ldr q28, [x22, #0x30]\n" "shl v26.16b, v4.16b, #0x4\n" - "and v8.16b, v8.16b, v16.16b\n" - "ldr q0, [x22, #0x40]\n" - "ldr q19, [x22, #0x50]\n" - ".inst 0x4f87e2ad // sdot v13.4s, v21.16b, v7.4b[0]\n" - ".inst 0x4f87e256 // sdot v22.4s, v18.16b, v7.4b[0]\n" + "and v8.16b, v8.16b, v13.16b\n" + "ldr q25, [x22, #0x40]\n" + "ldr q24, [x22, #0x50]\n" + ".inst 0x4f87e3a6 // sdot v6.4s, v29.16b, v7.4b[0]\n" 
+ ".inst 0x4f87e24f // sdot v15.4s, v18.16b, v7.4b[0]\n" "ldr q23, [x22, #0x60]\n" - "ldr q25, [x22, #0x70]\n" - ".inst 0x4fa7e2ac // sdot v12.4s, v21.16b, v7.4b[1]\n" - ".inst 0x4fa7e24f // sdot v15.4s, v18.16b, v7.4b[1]\n" - ".inst 0x4f87eaab // sdot v11.4s, v21.16b, v7.4b[2]\n" - ".inst 0x4f87ea4e // sdot v14.4s, v18.16b, v7.4b[2]\n" - "shl v29.16b, v2.16b, #0x4\n" + "ldr q22, [x22, #0x70]\n" + ".inst 0x4fa7e3a9 // sdot v9.4s, v29.16b, v7.4b[1]\n" + ".inst 0x4fa7e24c // sdot v12.4s, v18.16b, v7.4b[1]\n" + ".inst 0x4f87ebb4 // sdot v20.4s, v29.16b, v7.4b[2]\n" + ".inst 0x4f87ea5e // sdot v30.4s, v18.16b, v7.4b[2]\n" + "shl v21.16b, v17.16b, #0x4\n" "add x26, x26, #0x80\n" - ".inst 0x4fa7eaa6 // sdot v6.4s, v21.16b, v7.4b[3]\n" - ".inst 0x4fa7ea5f // sdot v31.4s, v18.16b, v7.4b[3]\n" - "shl v1.16b, v10.16b, #0x4\n" + ".inst 0x4fa7ebab // sdot v11.4s, v29.16b, v7.4b[3]\n" + ".inst 0x4fa7ea4e // sdot v14.4s, v18.16b, v7.4b[3]\n" + "shl v29.16b, v1.16b, #0x4\n" "add x22, x22, #0x80\n" - ".inst 0x4f83e30d // sdot v13.4s, v24.16b, v3.4b[0]\n" - ".inst 0x4f83e356 // sdot v22.4s, v26.16b, v3.4b[0]\n" - "shl v21.16b, v30.16b, #0x4\n" - ".inst 0x4fa3e30c // sdot v12.4s, v24.16b, v3.4b[1]\n" - ".inst 0x4fa3e34f // sdot v15.4s, v26.16b, v3.4b[1]\n" - "shl v18.16b, v28.16b, #0x4\n" - ".inst 0x4f83eb0b // sdot v11.4s, v24.16b, v3.4b[2]\n" - ".inst 0x4f83eb4e // sdot v14.4s, v26.16b, v3.4b[2]\n" - "and v5.16b, v5.16b, v16.16b\n" - ".inst 0x4fa3eb06 // sdot v6.4s, v24.16b, v3.4b[3]\n" - ".inst 0x4fa3eb5f // sdot v31.4s, v26.16b, v3.4b[3]\n" - "and v4.16b, v4.16b, v16.16b\n" - ".inst 0x4f94e3ad // sdot v13.4s, v29.16b, v20.4b[0]\n" - ".inst 0x4f94e036 // sdot v22.4s, v1.16b, v20.4b[0]\n" - "and v2.16b, v2.16b, v16.16b\n" - ".inst 0x4fb4e3ac // sdot v12.4s, v29.16b, v20.4b[1]\n" - ".inst 0x4fb4e02f // sdot v15.4s, v1.16b, v20.4b[1]\n" - "and v10.16b, v10.16b, v16.16b\n" - ".inst 0x4f94ebab // sdot v11.4s, v29.16b, v20.4b[2]\n" - ".inst 0x4f94e82e // sdot v14.4s, v1.16b, 
v20.4b[2]\n" - "and v30.16b, v30.16b, v16.16b\n" - ".inst 0x4fb4eba6 // sdot v6.4s, v29.16b, v20.4b[3]\n" - ".inst 0x4fb4e83f // sdot v31.4s, v1.16b, v20.4b[3]\n" - "and v28.16b, v28.16b, v16.16b\n" - ".inst 0x4f9be2ad // sdot v13.4s, v21.16b, v27.4b[0]\n" - ".inst 0x4f9be256 // sdot v22.4s, v18.16b, v27.4b[0]\n" - ".inst 0x4fbbe2ac // sdot v12.4s, v21.16b, v27.4b[1]\n" - ".inst 0x4fbbe24f // sdot v15.4s, v18.16b, v27.4b[1]\n" - ".inst 0x4f9beaab // sdot v11.4s, v21.16b, v27.4b[2]\n" - ".inst 0x4f9bea4e // sdot v14.4s, v18.16b, v27.4b[2]\n" - ".inst 0x4fbbeaa6 // sdot v6.4s, v21.16b, v27.4b[3]\n" - ".inst 0x4fbbea5f // sdot v31.4s, v18.16b, v27.4b[3]\n" - ".inst 0x4f80e22d // sdot v13.4s, v17.16b, v0.4b[0]\n" - ".inst 0x4f80e116 // sdot v22.4s, v8.16b, v0.4b[0]\n" - ".inst 0x4fa0e22c // sdot v12.4s, v17.16b, v0.4b[1]\n" - ".inst 0x4fa0e10f // sdot v15.4s, v8.16b, v0.4b[1]\n" - ".inst 0x4f80ea2b // sdot v11.4s, v17.16b, v0.4b[2]\n" - ".inst 0x4f80e90e // sdot v14.4s, v8.16b, v0.4b[2]\n" - ".inst 0x4fa0ea26 // sdot v6.4s, v17.16b, v0.4b[3]\n" - ".inst 0x4fa0e91f // sdot v31.4s, v8.16b, v0.4b[3]\n" - ".inst 0x4f93e0ad // sdot v13.4s, v5.16b, v19.4b[0]\n" - ".inst 0x4f93e096 // sdot v22.4s, v4.16b, v19.4b[0]\n" - ".inst 0x4fb3e0ac // sdot v12.4s, v5.16b, v19.4b[1]\n" - ".inst 0x4fb3e08f // sdot v15.4s, v4.16b, v19.4b[1]\n" - ".inst 0x4f93e8ab // sdot v11.4s, v5.16b, v19.4b[2]\n" - ".inst 0x4f93e88e // sdot v14.4s, v4.16b, v19.4b[2]\n" - ".inst 0x4fb3e8a6 // sdot v6.4s, v5.16b, v19.4b[3]\n" - ".inst 0x4fb3e89f // sdot v31.4s, v4.16b, v19.4b[3]\n" - ".inst 0x4f97e04d // sdot v13.4s, v2.16b, v23.4b[0]\n" - ".inst 0x4f97e156 // sdot v22.4s, v10.16b, v23.4b[0]\n" - ".inst 0x4fb7e04c // sdot v12.4s, v2.16b, v23.4b[1]\n" - ".inst 0x4fb7e14f // sdot v15.4s, v10.16b, v23.4b[1]\n" - ".inst 0x4f97e84b // sdot v11.4s, v2.16b, v23.4b[2]\n" - ".inst 0x4f97e94e // sdot v14.4s, v10.16b, v23.4b[2]\n" - ".inst 0x4fb7e846 // sdot v6.4s, v2.16b, v23.4b[3]\n" - ".inst 0x4fb7e95f // sdot 
v31.4s, v10.16b, v23.4b[3]\n" - ".inst 0x4f99e3cd // sdot v13.4s, v30.16b, v25.4b[0]\n" - ".inst 0x4f99e396 // sdot v22.4s, v28.16b, v25.4b[0]\n" - ".inst 0x4fb9e3cc // sdot v12.4s, v30.16b, v25.4b[1]\n" - ".inst 0x4fb9e38f // sdot v15.4s, v28.16b, v25.4b[1]\n" - ".inst 0x4f99ebcb // sdot v11.4s, v30.16b, v25.4b[2]\n" - ".inst 0x4f99eb8e // sdot v14.4s, v28.16b, v25.4b[2]\n" - ".inst 0x4fb9ebc6 // sdot v6.4s, v30.16b, v25.4b[3]\n" - ".inst 0x4fb9eb9f // sdot v31.4s, v28.16b, v25.4b[3]\n" + ".inst 0x4f83e366 // sdot v6.4s, v27.16b, v3.4b[0]\n" + ".inst 0x4f83e34f // sdot v15.4s, v26.16b, v3.4b[0]\n" + "shl v19.16b, v31.16b, #0x4\n" + ".inst 0x4fa3e369 // sdot v9.4s, v27.16b, v3.4b[1]\n" + ".inst 0x4fa3e34c // sdot v12.4s, v26.16b, v3.4b[1]\n" + "shl v18.16b, v0.16b, #0x4\n" + ".inst 0x4f83eb74 // sdot v20.4s, v27.16b, v3.4b[2]\n" + ".inst 0x4f83eb5e // sdot v30.4s, v26.16b, v3.4b[2]\n" + "and v5.16b, v5.16b, v13.16b\n" + ".inst 0x4fa3eb6b // sdot v11.4s, v27.16b, v3.4b[3]\n" + ".inst 0x4fa3eb4e // sdot v14.4s, v26.16b, v3.4b[3]\n" + "and v4.16b, v4.16b, v13.16b\n" + ".inst 0x4f82e2a6 // sdot v6.4s, v21.16b, v2.4b[0]\n" + ".inst 0x4f82e3af // sdot v15.4s, v29.16b, v2.4b[0]\n" + "and v17.16b, v17.16b, v13.16b\n" + ".inst 0x4fa2e2a9 // sdot v9.4s, v21.16b, v2.4b[1]\n" + ".inst 0x4fa2e3ac // sdot v12.4s, v29.16b, v2.4b[1]\n" + "and v1.16b, v1.16b, v13.16b\n" + ".inst 0x4f82eab4 // sdot v20.4s, v21.16b, v2.4b[2]\n" + ".inst 0x4f82ebbe // sdot v30.4s, v29.16b, v2.4b[2]\n" + "and v31.16b, v31.16b, v13.16b\n" + ".inst 0x4fa2eaab // sdot v11.4s, v21.16b, v2.4b[3]\n" + ".inst 0x4fa2ebae // sdot v14.4s, v29.16b, v2.4b[3]\n" + "and v0.16b, v0.16b, v13.16b\n" + ".inst 0x4f9ce266 // sdot v6.4s, v19.16b, v28.4b[0]\n" + ".inst 0x4f9ce24f // sdot v15.4s, v18.16b, v28.4b[0]\n" + ".inst 0x4fbce269 // sdot v9.4s, v19.16b, v28.4b[1]\n" + ".inst 0x4fbce24c // sdot v12.4s, v18.16b, v28.4b[1]\n" + ".inst 0x4f9cea74 // sdot v20.4s, v19.16b, v28.4b[2]\n" + ".inst 0x4f9cea5e // sdot v30.4s, 
v18.16b, v28.4b[2]\n" + ".inst 0x4fbcea6b // sdot v11.4s, v19.16b, v28.4b[3]\n" + ".inst 0x4fbcea4e // sdot v14.4s, v18.16b, v28.4b[3]\n" + ".inst 0x4f99e146 // sdot v6.4s, v10.16b, v25.4b[0]\n" + ".inst 0x4f99e10f // sdot v15.4s, v8.16b, v25.4b[0]\n" + ".inst 0x4fb9e149 // sdot v9.4s, v10.16b, v25.4b[1]\n" + ".inst 0x4fb9e10c // sdot v12.4s, v8.16b, v25.4b[1]\n" + ".inst 0x4f99e954 // sdot v20.4s, v10.16b, v25.4b[2]\n" + ".inst 0x4f99e91e // sdot v30.4s, v8.16b, v25.4b[2]\n" + ".inst 0x4fb9e94b // sdot v11.4s, v10.16b, v25.4b[3]\n" + ".inst 0x4fb9e90e // sdot v14.4s, v8.16b, v25.4b[3]\n" + ".inst 0x4f98e0a6 // sdot v6.4s, v5.16b, v24.4b[0]\n" + ".inst 0x4f98e08f // sdot v15.4s, v4.16b, v24.4b[0]\n" + ".inst 0x4fb8e0a9 // sdot v9.4s, v5.16b, v24.4b[1]\n" + ".inst 0x4fb8e08c // sdot v12.4s, v4.16b, v24.4b[1]\n" + ".inst 0x4f98e8b4 // sdot v20.4s, v5.16b, v24.4b[2]\n" + ".inst 0x4f98e89e // sdot v30.4s, v4.16b, v24.4b[2]\n" + ".inst 0x4fb8e8ab // sdot v11.4s, v5.16b, v24.4b[3]\n" + ".inst 0x4fb8e88e // sdot v14.4s, v4.16b, v24.4b[3]\n" + ".inst 0x4f97e226 // sdot v6.4s, v17.16b, v23.4b[0]\n" + ".inst 0x4f97e02f // sdot v15.4s, v1.16b, v23.4b[0]\n" + ".inst 0x4fb7e229 // sdot v9.4s, v17.16b, v23.4b[1]\n" + ".inst 0x4fb7e02c // sdot v12.4s, v1.16b, v23.4b[1]\n" + ".inst 0x4f97ea34 // sdot v20.4s, v17.16b, v23.4b[2]\n" + ".inst 0x4f97e83e // sdot v30.4s, v1.16b, v23.4b[2]\n" + ".inst 0x4fb7ea2b // sdot v11.4s, v17.16b, v23.4b[3]\n" + ".inst 0x4fb7e82e // sdot v14.4s, v1.16b, v23.4b[3]\n" + ".inst 0x4f96e3e6 // sdot v6.4s, v31.16b, v22.4b[0]\n" + ".inst 0x4f96e00f // sdot v15.4s, v0.16b, v22.4b[0]\n" + ".inst 0x4fb6e3e9 // sdot v9.4s, v31.16b, v22.4b[1]\n" + ".inst 0x4fb6e00c // sdot v12.4s, v0.16b, v22.4b[1]\n" + ".inst 0x4f96ebf4 // sdot v20.4s, v31.16b, v22.4b[2]\n" + ".inst 0x4f96e81e // sdot v30.4s, v0.16b, v22.4b[2]\n" + ".inst 0x4fb6ebeb // sdot v11.4s, v31.16b, v22.4b[3]\n" + ".inst 0x4fb6e80e // sdot v14.4s, v0.16b, v22.4b[3]\n" "bgt 15b\n" "ldr q21, [x26, 
#0x0]\n" - "ldr q20, [x26, #0x10]\n" - "ld1 { v5.4s }, [x22]\n" - "ldr q18, [x26, #0x20]\n" + "ldr q4, [x26, #0x10]\n" + "ld1 { v19.4s }, [x22]\n" + "ldr q25, [x26, #0x20]\n" "add x22, x22, #0x10\n" - "ldr q29, [x26, #0x30]\n" - "ldr q19, [x22, #0x0]\n" + "ldr q24, [x26, #0x30]\n" + "ldr q18, [x22, #0x0]\n" "add x26, x26, #0x40\n" - "mla v13.4s, v21.4s, v5.s[0]\n" - "mla v22.4s, v20.4s, v5.s[0]\n" - "mla v12.4s, v21.4s, v5.s[1]\n" - "mla v15.4s, v20.4s, v5.s[1]\n" - "mla v11.4s, v21.4s, v5.s[2]\n" - "mla v14.4s, v20.4s, v5.s[2]\n" - "mla v6.4s, v21.4s, v5.s[3]\n" - "fmul v24.4s, v18.4s, v19.s[0]\n" - "mla v31.4s, v20.4s, v5.s[3]\n" - "scvtf v13.4s, v13.4s\n" - "fmul v25.4s, v29.4s, v19.s[0]\n" - "scvtf v22.4s, v22.4s\n" - "fmul v21.4s, v18.4s, v19.s[1]\n" - "scvtf v12.4s, v12.4s\n" - "fmul v20.4s, v29.4s, v19.s[1]\n" + "mla v6.4s, v21.4s, v19.s[0]\n" + "mla v15.4s, v4.4s, v19.s[0]\n" + "mla v9.4s, v21.4s, v19.s[1]\n" + "mla v12.4s, v4.4s, v19.s[1]\n" + "mla v20.4s, v21.4s, v19.s[2]\n" + "mla v30.4s, v4.4s, v19.s[2]\n" + "mla v11.4s, v21.4s, v19.s[3]\n" + "fmul v28.4s, v25.4s, v18.s[0]\n" + "mla v14.4s, v4.4s, v19.s[3]\n" + "scvtf v6.4s, v6.4s\n" + "fmul v22.4s, v24.4s, v18.s[0]\n" "scvtf v15.4s, v15.4s\n" - "fmul v9.4s, v18.4s, v19.s[2]\n" + "fmul v21.4s, v25.4s, v18.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v1.4s, v24.4s, v18.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v19.4s, v25.4s, v18.s[2]\n" + "scvtf v20.4s, v20.4s\n" + "fmul v10.4s, v24.4s, v18.s[2]\n" + "scvtf v30.4s, v30.4s\n" + "fmul v23.4s, v25.4s, v18.s[3]\n" "scvtf v11.4s, v11.4s\n" - "fmul v4.4s, v29.4s, v19.s[2]\n" + "fmul v2.4s, v24.4s, v18.s[3]\n" "scvtf v14.4s, v14.4s\n" - "fmul v5.4s, v18.4s, v19.s[3]\n" - "scvtf v6.4s, v6.4s\n" - "fmul v27.4s, v29.4s, v19.s[3]\n" - "scvtf v31.4s, v31.4s\n" - "fmul v13.4s, v13.4s, v24.4s\n" - "fmul v22.4s, v22.4s, v25.4s\n" - "fmul v12.4s, v12.4s, v21.4s\n" - "fmul v15.4s, v15.4s, v20.4s\n" - "fmul v11.4s, v11.4s, v9.4s\n" - "fmul v14.4s, v14.4s, v4.4s\n" - "fmul 
v6.4s, v6.4s, v5.4s\n" - "fmul v31.4s, v31.4s, v27.4s\n" - "ld1r { v7.4s }, [%x[clamp_vals]]\n" + "fmul v6.4s, v6.4s, v28.4s\n" + "fmul v15.4s, v15.4s, v22.4s\n" + "fmul v9.4s, v9.4s, v21.4s\n" + "fmul v12.4s, v12.4s, v1.4s\n" + "fmul v20.4s, v20.4s, v19.4s\n" + "fmul v30.4s, v30.4s, v10.4s\n" + "fmul v11.4s, v11.4s, v23.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "ldr q19, [x26, #0x0]\n" + "ldr q18, [x26, #0x10]\n" "add x20, %x[clamp_vals], #0x4\n" "cmp x25, #0x8\n" - "ld1r { v29.4s }, [x20]\n" + "ld1r { v25.4s }, [%x[clamp_vals]]\n" + "ld1r { v26.4s }, [x20]\n" "add x26, x26, #0x20\n" - "fmax v13.4s, v13.4s, v7.4s\n" - "fmax v22.4s, v22.4s, v7.4s\n" - "fmax v12.4s, v12.4s, v7.4s\n" - "fmax v15.4s, v15.4s, v7.4s\n" - "fmax v11.4s, v11.4s, v7.4s\n" - "fmax v14.4s, v14.4s, v7.4s\n" - "fmax v6.4s, v6.4s, v7.4s\n" - "fmax v31.4s, v31.4s, v7.4s\n" - "fmin v13.4s, v13.4s, v29.4s\n" - "fmin v22.4s, v22.4s, v29.4s\n" - "fmin v12.4s, v12.4s, v29.4s\n" - "fmin v15.4s, v15.4s, v29.4s\n" - "fmin v11.4s, v11.4s, v29.4s\n" - "fmin v14.4s, v14.4s, v29.4s\n" - "fmin v6.4s, v6.4s, v29.4s\n" - "fmin v31.4s, v31.4s, v29.4s\n" + "fadd v6.4s, v6.4s, v19.4s\n" + "fadd v15.4s, v15.4s, v18.4s\n" + "fadd v9.4s, v9.4s, v19.4s\n" + "fadd v12.4s, v12.4s, v18.4s\n" + "fadd v20.4s, v20.4s, v19.4s\n" + "fadd v30.4s, v30.4s, v18.4s\n" + "fadd v11.4s, v11.4s, v19.4s\n" + "fadd v14.4s, v14.4s, v18.4s\n" + "fmax v6.4s, v6.4s, v25.4s\n" + "fmax v15.4s, v15.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v20.4s, v20.4s, v25.4s\n" + "fmax v30.4s, v30.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmin v6.4s, v6.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v30.4s, v30.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" "blt 17f\n" "mov x20, %x[dst]\n" "cmp x12, #0x1\n" - "str q13, [x20, 
#0x0]\n" - "str q22, [x20, #0x10]\n" + "str q6, [x20, #0x0]\n" + "str q15, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" "ble 22f\n" "cmp x12, #0x2\n" - "str q12, [x20, #0x0]\n" - "str q15, [x20, #0x10]\n" + "str q9, [x20, #0x0]\n" + "str q12, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" "ble 22f\n" "cmp x12, #0x3\n" - "str q11, [x20, #0x0]\n" - "str q14, [x20, #0x10]\n" + "str q20, [x20, #0x0]\n" + "str q30, [x20, #0x10]\n" "add x20, x20, %x[dst_stride_row]\n" "ble 22f\n" - "str q6, [x20, #0x0]\n" - "str q31, [x20, #0x10]\n" + "str q11, [x20, #0x0]\n" + "str q14, [x20, #0x10]\n" "b 22f\n" "17:" // Row tail: Partial output "mov x23, %x[dst]\n" @@ -755,45 +783,45 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( "add x20, x21, %x[dst_stride_row]\n" "csel x20, x20, x21, GT\n" "tbz x25, #2, 19f\n" - "st1 { v6.4s }, [x20], #0x10\n" - "st1 { v11.4s }, [x21], #0x10\n" - "st1 { v12.4s }, [x22], #0x10\n" - "st1 { v13.4s }, [x23], #0x10\n" + "st1 { v11.4s }, [x20], #0x10\n" + "st1 { v20.4s }, [x21], #0x10\n" + "st1 { v9.4s }, [x22], #0x10\n" + "st1 { v6.4s }, [x23], #0x10\n" "tbz x25, #1, 18f\n" - "st1 { v31.d }[0], [x20], #0x8\n" - "st1 { v14.d }[0], [x21], #0x8\n" - "st1 { v15.d }[0], [x22], #0x8\n" - "st1 { v22.d }[0], [x23], #0x8\n" + "st1 { v14.d }[0], [x20], #0x8\n" + "st1 { v30.d }[0], [x21], #0x8\n" + "st1 { v12.d }[0], [x22], #0x8\n" + "st1 { v15.d }[0], [x23], #0x8\n" "tbz x25, #0, 21f\n" - "st1 { v31.s }[2], [x20]\n" - "st1 { v14.s }[2], [x21]\n" - "st1 { v15.s }[2], [x22]\n" - "st1 { v22.s }[2], [x23]\n" + "st1 { v14.s }[2], [x20]\n" + "st1 { v30.s }[2], [x21]\n" + "st1 { v12.s }[2], [x22]\n" + "st1 { v15.s }[2], [x23]\n" "b 21f\n" "18:" // Row tail: Output block 0: partial_1_4 "tbz x25, #0, 21f\n" - "st1 { v31.s }[0], [x20]\n" - "st1 { v14.s }[0], [x21]\n" - "st1 { v15.s }[0], [x22]\n" - "st1 { v22.s }[0], [x23]\n" + "st1 { v14.s }[0], [x20]\n" + "st1 { v30.s }[0], [x21]\n" + "st1 { v12.s }[0], [x22]\n" + "st1 { v15.s 
}[0], [x23]\n" "b 21f\n" "19:" // Row tail: Output block 0: partial_2_0 "tbz x25, #1, 20f\n" - "st1 { v6.d }[0], [x20], #0x8\n" - "st1 { v11.d }[0], [x21], #0x8\n" - "st1 { v12.d }[0], [x22], #0x8\n" - "st1 { v13.d }[0], [x23], #0x8\n" + "st1 { v11.d }[0], [x20], #0x8\n" + "st1 { v20.d }[0], [x21], #0x8\n" + "st1 { v9.d }[0], [x22], #0x8\n" + "st1 { v6.d }[0], [x23], #0x8\n" "tbz x25, #0, 21f\n" - "st1 { v6.s }[2], [x20]\n" - "st1 { v11.s }[2], [x21]\n" - "st1 { v12.s }[2], [x22]\n" - "st1 { v13.s }[2], [x23]\n" + "st1 { v11.s }[2], [x20]\n" + "st1 { v20.s }[2], [x21]\n" + "st1 { v9.s }[2], [x22]\n" + "st1 { v6.s }[2], [x23]\n" "b 21f\n" "20:" // Row tail: Output block 0: partial_1_0 - "st1 { v6.s }[0], [x20]\n" - "st1 { v11.s }[0], [x21]\n" - "st1 { v12.s }[0], [x22]\n" - "st1 { v13.s }[0], [x23]\n" + "st1 { v11.s }[0], [x20]\n" + "st1 { v20.s }[0], [x21]\n" + "st1 { v9.s }[0], [x22]\n" + "st1 { v6.s }[0], [x23]\n" "21:" // Row tail: Output block 0: Done "22:" // Row tail: Output stage exit "subs x25, x25, #0x8\n" -- GitLab From 1a4aedf24fdd1cf89b4d1cbd9eaa0db07adfbcc9 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 25 Sep 2024 14:49:18 +0100 Subject: [PATCH 05/12] Add int4 dotprod matmul ukernels to the unit-tests Signed-off-by: Anitha Raj --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index fabb6611..44c20bed 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -17,7 +17,9 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h" #include 
"kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" @@ -34,10 +36,12 @@ namespace kai::test { -static const std::array, 6> +static const std::array, 8> variants_kai_matmul_clamp_f32_qai8dxp_qsi4cxp = {{ UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod, cpu_has_dotprod), UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod, cpu_has_dotprod), + UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod), + UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod), UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm, cpu_has_i8mm), UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm, cpu_has_i8mm), UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm, cpu_has_i8mm), -- GitLab From 0e3add4df1fd19ad8d76f9d0aa283ef059cb67e3 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Thu, 26 Sep 2024 15:19:11 +0100 Subject: [PATCH 06/12] Add update to Changelog.md Signed-off-by: Anitha Raj --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 123686b0..df1b6222 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ KleidiAI follows the [Semantic 
Versioning](https://semver.org/) specification fo - kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 renamed to kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0. - Remove FP16 GEMV micro-kernel optimized for Advanced SIMD. - Where a dedicated GEMV micro-kernel is not provided, it is recommended to use existing GEMM micro-kernels which have dedicated paths for M=1 (a "GEMV" operation). +- Micro-kernels to compute the matrix multiplication of dynamically quantized 8-bit integer (QAI8DX) activations and quantized 4-bit integer (QSI4CX) weights and the accumulation of the result into a single-precision (F32) output, optimized using the Arm® CPU feature FEAT_DotProd. ## v0.2.0 -- GitLab From 21d3afc9fb597c667c2f5564b0db139ec8e40bf1 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Thu, 26 Sep 2024 15:23:54 +0100 Subject: [PATCH 07/12] Remove empty lines Signed-off-by: Anitha Raj --- ...matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h | 1 - ..._matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h | 1 - 2 files changed, 2 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h index 8b009bb0..ac1d8272 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h @@ -123,7 +123,6 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotp /// @param[in] scalar_max Max value used to clamp the final result. 
void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( size_t m, size_t n, size_t k, - const void* lhs_packed, // const void* rhs_packed, // float* dst, // diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h index c6f36e29..0bd4796b 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h @@ -123,7 +123,6 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotpr /// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( size_t m, size_t n, size_t k, - const void* lhs_packed, // const void* rhs_packed, // float* dst, // -- GitLab From c3629be5ade625d6f2b1ce3c91da24bcc4f1ad28 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Fri, 27 Sep 2024 11:28:34 +0100 Subject: [PATCH 08/12] Build fix: Add checks for CPU feature in unit test after rebase Signed-off-by: Anitha Raj --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 44c20bed..5be8bb2c 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -40,8 +40,8 @@ static const std::array Date: Mon, 30 Sep 2024 15:00:48 +0100 Subject: [PATCH 09/12] Move the update to Upcoming Release in the Changelog Signed-off-by: Anitha Raj --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
df1b6222..bc09e527 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification for releases. +## v0.4.0 -- Upcoming Release + +- Micro-kernels to compute the matrix multiplication of dynamically quantized 8-bit integer (QAI8DX) activations and quantized 4-bit integer (QSI4CX) weights and the accumulation of the result into a single-precision (F32) output, optimized using the Arm® CPU feature FEAT_DotProd. + ## v0.3.0 - Advanced SIMD FP32 GEMM micro-kernel. @@ -19,7 +23,6 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 renamed to kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0. - Remove FP16 GEMV micro-kernel optimized for Advanced SIMD. - Where a dedicated GEMV micro-kernel is not provided, it is recommended to use existing GEMM micro-kernels which have dedicated paths for M=1 (a "GEMV" operation). -- Micro-kernels to compute the matrix multiplication of dynamically quantized 8-bit integer (QAI8DX) activations and quantized 4-bit integer (QSI4CX) weights and the accumulation of the result into a single-precision (F32) output, optimized using the Arm® CPU feature FEAT_DotProd. 
## v0.2.0 -- GitLab From 57c71b26040d57999090330c692352b7a67c7609 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 8 Oct 2024 17:25:25 +0100 Subject: [PATCH 10/12] Fix names of the int4 matmul ukernels * Rename kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod to kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod * Rename kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod to kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod Signed-off-by: Anitha Raj --- CMakeLists.txt | 4 +- .../CMakeLists.txt | 4 +- .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp | 52 +++++++++---------- kai/ukernels/matmul/BUILD.bazel | 16 +++--- ...8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c} | 24 ++++----- ...8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h} | 41 ++++++++------- ...i8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c} | 24 ++++----- ...i8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h} | 35 +++++++------ .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 8 +-- 9 files changed, 105 insertions(+), 103 deletions(-) rename kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/{kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c => kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c} (97%) rename kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/{kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h => kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h} (81%) rename kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/{kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c => kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c} (97%) rename kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/{kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h => kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h} (83%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 558c5011..ee35f8e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,8 +96,8 @@ 
set(KLEIDIAI_FILES_NEON set(KLEIDIAI_FILES_NEON_DOTPROD kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c - kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c - kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c + kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c + kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod.c diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt index 7518d84e..02dd1ede 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt @@ -29,8 +29,8 @@ add_executable(matmul_clamp_f32_qai8dxp_qsi4cxp ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c - ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c - ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c + ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c + 
${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c ) # Compile with DotProd and I8MM features enabled diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp index e8db11cd..057f8eb5 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp @@ -18,11 +18,11 @@ #include "kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h" -#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h" -#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h" #include "kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h" #include "kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.h" @@ -115,30 +115,30 @@ kai_matmul_ukernel_f32_qa8dxp_qs4cxp ukernel_variants[] = { kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm, kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm, "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm"}, - {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - 
kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod, - "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod"}, - {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod, - "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + 
kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod"}, }; diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 59c00e0c..6a088107 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -72,9 +72,9 @@ kai_c_library( ) kai_c_library( - name = "clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod", - srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c"], - hdrs = 
["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h"], + name = "clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod", + srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c"], + hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h"], cpu_uarch = kai_cpu_dotprod(), deps = [ ":clamp_f32_qai8dxp_qsi4cxp_interface", @@ -82,9 +82,9 @@ kai_c_library( ) kai_c_library( - name = "clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod", - srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c"], - hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h"], + name = "clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod", + srcs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c"], + hdrs = ["matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h"], cpu_uarch = kai_cpu_dotprod(), deps = [ ":clamp_f32_qai8dxp_qsi4cxp_interface", @@ -301,11 +301,11 @@ kai_c_library( ":clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm", ":clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm", ":clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm", + ":clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod", ":clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm", ":clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm", - ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod", + ":clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod", ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm", - ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod", ":clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm", ":clamp_f32_qai8dxp_qsi4c32p_interface", ":clamp_f32_qsi8d32p_qsi4c32p_dotprod", diff --git 
a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c similarity index 97% rename from kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c rename to kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c index 0dea488e..130b86aa 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.c @@ -7,7 +7,7 @@ #if !defined(__ARM_FEATURE_DOTPROD) #error "Dotprod extension required to compile this micro-kernel" #else // Architectural features check. -#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h" #include #include @@ -50,43 +50,43 @@ inline static size_t kai_rhs_packed_stride(size_t k) { return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias); } -size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_m_step; } -size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_n_step; } -size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_mr; } -size_t 
kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_nr; } -size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_kr; } -size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void) { +size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void) { return kai_sr; } -size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m_idx, size_t k) { +size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); } -size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t n_idx, size_t k) { +size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); } -size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( +size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t m_idx, size_t n_idx, size_t dst_stride) { KAI_ASSERT((m_idx % kai_m_step) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); @@ -94,11 +94,11 @@ size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_do return (n_idx * sizeof(float)) + m_idx * dst_stride; } -size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m, size_t n) { +size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t m, size_t n) { return m * n * sizeof(float); } -void 
kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( +void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t m, size_t n, size_t k, const void* restrict lhs_packed, const void* restrict rhs_packed, float* restrict dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) { KAI_ASSERT(dst_stride_col == sizeof(float)); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h similarity index 81% rename from kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h rename to kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h index ac1d8272..3227e1e2 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h @@ -15,7 +15,7 @@ extern "C" { /// Micro-kernel dependencies /// /// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix -/// -# kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix +/// -# kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0 to pack the RHS matrix /// -------------------------------------------------- @@ -24,37 +24,37 @@ extern "C" { /// be processed must be a multiple of m step. /// /// @return the m step value -size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the n step value. /// The micro-kernel can process any N values. 
However, the starting N index to /// be processed must be a multiple of n step. /// /// @return the n step -size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the mr value, which must be used to pack the LHS matrix /// /// @return the mr value -size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the nr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the nr value -size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the kr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the kr value -size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the sr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the sr value -size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(void); +size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(void); /// Gets the offset in bytes for the packed LHS matrix, /// which contains the packed Signed 8-bit quantized asymmetric per-row (qai8dxp) values. 
@@ -65,27 +65,27 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(vo /// @param[in] k Total number of columns in the LHS matrix (not packed). /// /// @return the offset in bytes to the packed LHS matrix -size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m_idx, size_t k); +size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t m_idx, size_t k); /// Gets the offset in bytes for the packed RHS matrix, /// which contains the packed Signed 4-bit quantized symmetric per-channel (qsi4cxp) values. /// -/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8. +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4. /// @param[in] k The common dimension between the LHS and RHS matrix (K). /// /// @return the offset in bytes to the packed RHS matrix -size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( +size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t n_idx, // size_t k); /// Gets the offset in bytes for the DST matrix /// /// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 16. -/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 8. +/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 4. /// @param[in] dst_stride The number of bytes in in each row of the DST matrix /// /// @return the DST offset in bytes -size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( +size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t m_idx, // size_t n_idx, // size_t dst_stride); @@ -96,13 +96,13 @@ size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_do /// @param[in] n Number of columns in the destination (DST) matrix. 
/// /// @return the destination (DST) matrix size in bytes -size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod(size_t m, size_t n); +size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod(size_t m, size_t n); /// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. /// /// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dxp) and packed /// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cxp) and packed. -/// Output tile: (rows x cols) = 16 x 8 +/// Output tile: (rows x cols) = 16 x 4 /// Accumulation performed in a single for loop: 32 /// Extension used: dotprod /// @@ -114,15 +114,16 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotp /// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref -/// kai_run_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 -/// OR kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// kai_run_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 +/// OR kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0 /// @param[out] dst The DST matrix. /// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. /// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. It must be sizeof(float). /// @param[in] scalar_min Min value used to clamp the final result. /// @param[in] scalar_max Max value used to clamp the final result. 
-void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod( +void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t m, size_t n, size_t k, + const void* lhs_packed, // const void* rhs_packed, // float* dst, // diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c similarity index 97% rename from kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c rename to kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c index 3d574c23..d0549134 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.c @@ -7,7 +7,7 @@ #if !defined(__ARM_FEATURE_DOTPROD) #error "Dotprod extension required to compile this micro-kernel" #else // Architectural features check. 
-#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h" +#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h" #include #include @@ -50,43 +50,43 @@ inline static size_t kai_rhs_packed_stride(size_t k) { return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias); } -size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_m_step; } -size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_n_step; } -size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_mr; } -size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_nr; } -size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_kr; } -size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void) { +size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void) { return kai_sr; } -size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m_idx, size_t k) { +size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(size_t m_idx, size_t k) { KAI_ASSERT((m_idx % kai_m_step) == 0); return (m_idx / kai_m_step) * kai_lhs_packed_stride(k); } -size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t n_idx, size_t k) { +size_t 
kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(size_t n_idx, size_t k) { KAI_ASSERT((n_idx % kai_n_step) == 0); return (n_idx / kai_n_step) * kai_rhs_packed_stride(k); } -size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( +size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t m_idx, size_t n_idx, size_t dst_stride) { KAI_ASSERT((m_idx % kai_m_step) == 0); KAI_ASSERT((n_idx % kai_n_step) == 0); @@ -94,11 +94,11 @@ size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dot return (n_idx * sizeof(float)) + m_idx * dst_stride; } -size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m, size_t n) { +size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(size_t m, size_t n) { return m * n * sizeof(float); } -void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( +void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t m, size_t n, size_t k, const void* restrict lhs_packed, const void* restrict rhs_packed, float* restrict dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max) { KAI_ASSERT(dst_stride_col == sizeof(float)); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h similarity index 83% rename from kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h rename to kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h index 0bd4796b..5d827392 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h +++ 
b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h @@ -15,7 +15,7 @@ extern "C" { /// Micro-kernel dependencies /// /// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix -/// -# kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix +/// -# kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 OR kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0 to pack the RHS matrix /// -------------------------------------------------- @@ -24,37 +24,37 @@ extern "C" { /// be processed must be a multiple of m step. /// /// @return the m step value -size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the n step value. /// The micro-kernel can process any N values. However, the starting N index to /// be processed must be a multiple of n step. /// /// @return the n step -size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the mr value, which must be used to pack the LHS matrix /// /// @return the mr value -size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the nr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the nr value -size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the kr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref 
kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the kr value -size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the sr value, which must be used to pack the RHS matrix with -/// the @ref kai_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 micro-kernel +/// the @ref kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 micro-kernel /// /// @return the sr value -size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(void); +size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(void); /// Gets the offset in bytes for the packed LHS matrix, /// which contains the packed Signed 8-bit quantized asymmetric per-row (qai8dxp) values. @@ -65,7 +65,7 @@ size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(voi /// @param[in] k Total number of columns in the LHS matrix (not packed). /// /// @return the offset in bytes to the packed LHS matrix -size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m_idx, size_t k); +size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(size_t m_idx, size_t k); /// Gets the offset in bytes for the packed RHS matrix, /// which contains the packed Signed 4-bit quantized symmetric per-channel (qsi4cxp) values. @@ -74,7 +74,7 @@ size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_n /// @param[in] k The common dimension between the LHS and RHS matrix (K). 
/// /// @return the offset in bytes to the packed RHS matrix -size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( +size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t n_idx, // size_t k); @@ -85,7 +85,7 @@ size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_n /// @param[in] dst_stride The number of bytes in in each row of the DST matrix /// /// @return the DST offset in bytes -size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( +size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t m_idx, // size_t n_idx, // size_t dst_stride); @@ -96,7 +96,7 @@ size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dot /// @param[in] n Number of columns in the destination (DST) matrix. /// /// @return the destination (DST) matrix size in bytes -size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod(size_t m, size_t n); +size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod(size_t m, size_t n); /// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. /// @@ -114,15 +114,16 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotpr /// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs /// both the dynamic quantization to 8-bit and activation packing in a single step. /// @param[in] rhs_packed The RHS packed matrix, which is obtained by calling @ref -/// kai_run_rhs_pack_kxn_qsi4cxp_qsu4cxs1s0 -/// OR kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// kai_run_rhs_pack_kxn_qsi4cxp_qs4cxs1s0 +/// OR kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0 /// @param[out] dst The DST matrix. /// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. 
/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. It must be sizeof(float). /// @param[in] scalar_min Min value used to clamp the final result. /// @param[in] scalar_max Max value used to clamp the final result. -void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod( +void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t m, size_t n, size_t k, + const void* lhs_packed, // const void* rhs_packed, // float* dst, // diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 5be8bb2c..53aaf00b 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -15,11 +15,11 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_16x4x32_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_dotprod.h" #include 
"kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" @@ -40,8 +40,8 @@ static const std::array Date: Wed, 9 Oct 2024 09:42:33 +0100 Subject: [PATCH 11/12] Add sdot gemm kernels to the dotprod variants in benchmarks after rebase Signed-off-by: Anitha Raj --- benchmark/matmul/matmul_f32.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/benchmark/matmul/matmul_f32.cpp b/benchmark/matmul/matmul_f32.cpp index e956a7c5..bc6b712e 100644 --- a/benchmark/matmul/matmul_f32.cpp +++ b/benchmark/matmul/matmul_f32.cpp @@ -16,8 +16,10 @@ #include "benchmark/matmul/matmul_utils.hpp" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h" @@ -154,6 
+156,30 @@ kai_matmul_ukernel_f32_qa8dxp_qs4cxp ukernel_variants[] = { kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod, kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod, "matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod"}, + {kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + 
kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod, + "matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod"}, }; void RegisterBenchmarks(size_t m, size_t n, size_t k) { -- GitLab From b6c2744d79035a6cbb08da6a76219c721707a201 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Mon, 14 Oct 2024 11:32:41 +0100 Subject: [PATCH 12/12] Address review comments: White space and Changelog edits Signed-off-by: Anitha Raj --- CHANGELOG.md | 2 +- ...atmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h | 1 - ...matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc09e527..d0b00dc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo ## v0.4.0 -- Upcoming Release -- Micro-kernels to compute the matrix multiplication of dynamically quantized 8-bit integer (QAI8DX) activations and quantized 4-bit integer (QSI4CX) weights and the accumulation of the result into a single-precision (F32) output, optimized using the Arm® CPU feature FEAT_DotProd. +- Micro-kernels to compute the matrix multiplication of dynamically quantized 8-bit integer (QAI8DX) LHS matrix, which typically holds the neural network activations, and quantized 4-bit integer (QSI4CX) RHS matrix, which typically holds the neural network weights, and the accumulation of the result into a single-precision (F32) output, optimized using the Arm® CPU feature FEAT_DotProd. 
## v0.3.0 diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h index 3227e1e2..63c538be 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod.h @@ -123,7 +123,6 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotp /// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x4_16x4x32_neon_dotprod( size_t m, size_t n, size_t k, - const void* lhs_packed, // const void* rhs_packed, // float* dst, // diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h index 5d827392..6e77b84a 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod.h @@ -123,7 +123,6 @@ size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotpr /// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x4_8x8x32_neon_dotprod( size_t m, size_t n, size_t k, - const void* lhs_packed, // const void* rhs_packed, // float* dst, // -- GitLab