From e9ee1be9af426187db194a4dbc362a431c205e03 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 21 May 2024 15:53:44 +0100 Subject: [PATCH 1/3] Add GeMM FP16 and BF16 hybrid kernels * Add GeMM hybrid kernels. - Block size: 6x16. - Data type combinations : - FP16 FP16 FP16 FP16 - FP32 BF16 BF16 FP32 * Update test framework and add tests for new kernels. Signed-off-by: Viet-Hoa Do --- .editorconfig | 4 + .pre-commit-config.yaml | 2 +- CMakeLists.txt | 21 + src/kai_common.h | 5 + ...clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c | 3023 ++++++++++++++ ...clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h | 110 + ...mp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c | 3704 +++++++++++++++++ ...mp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h | 111 + ...ias_pack_transpose_f16_f16p16x1zf16_neon.c | 220 + ...ias_pack_transpose_f16_f16p16x1zf16_neon.h | 90 + ...ack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c | 498 +++ ...ack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h | 90 + test/common/bfloat16.cpp | 17 + test/common/bfloat16.hpp | 88 + test/common/compare.cpp | 136 +- test/common/data_format.cpp | 91 +- test/common/data_format.hpp | 57 +- test/common/data_type.cpp | 6 +- test/common/data_type.hpp | 2 + test/common/float16.cpp | 17 + test/common/float16.hpp | 24 + test/common/int4.cpp | 16 + test/common/int4.hpp | 6 + test/common/printer.cpp | 35 +- test/common/type_traits.hpp | 21 + test/reference/binary_elementwise.cpp | 4 + test/reference/cast.cpp | 44 + test/reference/cast.hpp | 28 + test/reference/fill.cpp | 28 +- test/reference/matmul.cpp | 81 +- test/reference/matmul.hpp | 32 +- test/reference/pack.cpp | 83 +- test/reference/transpose.cpp | 38 + test/reference/transpose.hpp | 27 + test/tests/matmul_test.cpp | 301 +- 35 files changed, 8820 insertions(+), 240 deletions(-) create mode 100644 src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c create mode 100644 src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h 
create mode 100644 src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c create mode 100644 src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h create mode 100644 src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c create mode 100644 src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h create mode 100644 src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c create mode 100644 src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h create mode 100644 test/common/bfloat16.cpp create mode 100644 test/common/bfloat16.hpp create mode 100644 test/common/float16.cpp create mode 100644 test/common/float16.hpp create mode 100644 test/reference/cast.cpp create mode 100644 test/reference/cast.hpp create mode 100644 test/reference/transpose.cpp create mode 100644 test/reference/transpose.hpp diff --git a/.editorconfig b/.editorconfig index 6b724e49..b5e5f7d0 100644 --- a/.editorconfig +++ b/.editorconfig @@ -16,6 +16,10 @@ indent_style = space insert_final_newline = true trim_trailing_whitespace = true +# Override settings. +[*.{c,h,cpp,hpp}] +indent_size = unset # Avoid conflict with clang-format. + # Override settings. 
[*.{json,yml,yaml}] indent_size = 2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e6eb382c..8757ead2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: rev: v4.5.0 hooks: - id: check-added-large-files - args: [ '--maxkb=100' ] + args: [ '--maxkb=200' ] stages: [ commit ] name: "File size limit not exceeded" - id: check-yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f3e82a8..fd7d79c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,22 @@ set(KLEIDIAI_WARNING_FLAGS $<$<COMPILE_LANGUAGE:CXX>:${KLEIDIAI_WARNING_FLAGS_CXX}> ) +add_library(kleidiai + src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c + src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c + src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c + src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c +) + +target_include_directories(kleidiai + PUBLIC src + PRIVATE . 
+) + +target_compile_options(kleidiai + PRIVATE ${KLEIDIAI_WARNING_FLAGS} +) + if(KLEIDIAI_BUILD_TESTS) enable_testing() include(GoogleTest) @@ -68,6 +84,8 @@ if(KLEIDIAI_BUILD_TESTS) test/common/compare.cpp test/common/matrix_portion.cpp test/common/rect.cpp + test/common/bfloat16.cpp + test/common/float16.cpp test/reference/binary_elementwise.cpp test/reference/matmul.cpp @@ -76,6 +94,8 @@ if(KLEIDIAI_BUILD_TESTS) test/reference/quantize.cpp test/reference/reduce.cpp test/reference/round.cpp + test/reference/transpose.cpp + test/reference/cast.cpp test/tests/matmul_test.cpp ) @@ -90,6 +110,7 @@ if(KLEIDIAI_BUILD_TESTS) target_link_libraries(kleidiai_test PRIVATE GTest::gtest_main + PRIVATE kleidiai ) # Cross-compiling is a common use case which creates a conflict if DISCOVERY_MODE is set to POST_BUILD (by default) diff --git a/src/kai_common.h b/src/kai_common.h index b44dad5b..a2548d5b 100644 --- a/src/kai_common.h +++ b/src/kai_common.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include @@ -40,3 +41,7 @@ #define KAI_ASSUME_IF KAI_ASSERT_IF #define KAI_UNUSED(x) (void)(x) + +static inline size_t kai_round_up_multiple_usize(size_t a, size_t b) { + return (a + b - 1) / b * b; +} diff --git a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c b/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c new file mode 100644 index 00000000..685ccf43 --- /dev/null +++ b/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c @@ -0,0 +1,3023 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "kai_common.h" + +static const size_t block_height = 6; +static const size_t block_width = 16; + +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m) { + KAI_UNUSED(m); + + return 6; +} + +size_t 
kai_get_n_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n) { + KAI_UNUSED(n); + + return 16; +} + +size_t kai_get_lhs_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k) { + return k * sizeof(__fp16); +} + +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t k_idx, size_t stride) { + KAI_ASSUME(m_idx % block_height == 0); + KAI_ASSUME(k_idx == 0); + + return m_idx * stride; +} + +size_t kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k, size_t n_idx, size_t k_idx) { + KAI_ASSUME(n_idx % block_width == 0); + KAI_ASSUME(k_idx == 0); + + return n_idx / block_width * (block_width * sizeof(__fp16) + block_width * k * sizeof(__fp16)); +} + +size_t kai_get_dst_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n) { + return n * sizeof(__fp16); +} + +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t n_idx, size_t stride) { + KAI_ASSUME(m_idx % block_height == 0); + KAI_ASSUME(n_idx % block_width == 0); + + return m_idx * stride + n_idx * sizeof(__fp16); +} + +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m, size_t n, size_t stride) { + return m * stride + n * sizeof(__fp16); +} + +void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( + size_t m, size_t n, size_t k, // + const void* lhs, const void* packed_rhs, void* dst, // + size_t lhs_stride, size_t dst_stride, // + __fp16 clamp_min, __fp16 clamp_max) { + typedef struct { + __fp16 maxval; + __fp16 minval; + unsigned int num_strings; + const unsigned int* string_lengths; + size_t N; + const void* B_ptr; + size_t output_offset; + size_t input_initial_col; + size_t input_offset; + void* output_ptr; + const void* bias; + } KernelArgs; + + KernelArgs ka; + + unsigned long flags = 0; + + unsigned int string_length = k; + ka.num_strings = 1; + ka.string_lengths = &string_length; + ka.N = n; + ka.B_ptr = packed_rhs; 
+ ka.bias = NULL; + + // Direct input. + const void* input_ptr = lhs; + ka.input_offset = lhs_stride / sizeof(__fp16); + ka.input_initial_col = 0; + + // Direct output. + ka.output_ptr = dst; + ka.output_offset = dst_stride / sizeof(__fp16); + + // Clamping output. + flags |= 0x2; + ka.maxval = clamp_max; + ka.minval = clamp_min; + + __asm__ __volatile__( + "1:" // Row loop + "cmp %x[m], #0x6\n" + "bge 166f\n" + "cmp %x[m], #0x4\n" + "bgt 133f\n" + "beq 100f\n" + "cmp %x[m], #0x2\n" + "bgt 67f\n" + "beq 34f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "2:" // Height 1: Column loop + "cbz x10, 3f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "b 14f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 13f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "tbz x11, #2, 5f\n" + "ldr d21, [x9], #0x8\n" + "tbz x11, #1, 4f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "mov x20, #0x1c\n" + "tbz x11, #0, 11f\n" + "ld1 { v21.h }[6], [x9]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 11f\n" + "ld1 { v21.h }[4], [x9]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr s21, [x9], #0x4\n" + "mov x20, #0x14\n" + "tbz x11, #0, 11f\n" + "ld1 { v21.h }[2], [x9]\n" + "b 11f\n" + "6:" // Height 1: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 11f\n" + "ldr h21, [x9, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ldr d20, [x9], #0x8\n" + "tbz x11, #1, 8f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "mov x20, #0xc\n" + "tbz x11, #0, 11f\n" + "ld1 { v20.h }[6], [x9]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v20.h }[4], [x9]\n" + "b 11f\n" + "9:" // Height 1: 
Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr s20, [x9], #0x4\n" + "mov x20, #0x4\n" + "tbz x11, #0, 11f\n" + "ld1 { v20.h }[2], [x9]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr h20, [x9, #0x0]\n" + "mov x20, #0x0\n" + "11:" // Height 1: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 14f\n" + "12:" // Height 1: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "b 14f\n" + "13:" // Height 1: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "14:" // Height 1: setup done + "mov x28, #0x0\n" + "15:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 16f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 17f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "b 17f\n" + "16:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "17:" // Height 1: input setup done + "cmp x27, #0x8\n" + "blt 20f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 19f\n" + "18:" // Height 1: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "cmp x27, #0x10\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "fmla v21.8h, 
v9.8h, v0.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 18b\n" + "19:" // Height 1: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x26, x26, #0x10\n" + "sub x27, x27, #0x8\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "20:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 22f\n" + "21:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr q8, [x10, #0x0]\n" + "sub x27, x27, #0x1\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v21.8h, v9.8h, v0.h[0]\n" + "cbnz x27, 21b\n" + "22:" // Height 1: Multiply loop: No odd 
multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 15b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 23f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "23:" // Height 1: No activation + "cmp x11, #0x10\n" + "bge 32f\n" + "tbz x11, #3, 27f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "tbz x11, #2, 25f\n" + "str d21, [x9], #0x8\n" + "tbz x11, #1, 24f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "tbz x11, #0, 31f\n" + "st1 { v21.h }[6], [x9]\n" + "b 31f\n" + "24:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 31f\n" + "st1 { v21.h }[4], [x9]\n" + "b 31f\n" + "25:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 26f\n" + "str s21, [x9], #0x4\n" + "tbz x11, #0, 31f\n" + "st1 { v21.h }[2], [x9]\n" + "b 31f\n" + "26:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 31f\n" + "str h21, [x9, #0x0]\n" + "b 31f\n" + "27:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 29f\n" + "str d20, [x9], #0x8\n" + "tbz x11, #1, 28f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "tbz x11, #0, 31f\n" + "st1 { v20.h }[6], [x9]\n" + "b 31f\n" + "28:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 31f\n" + "st1 { v20.h }[4], [x9]\n" + "b 31f\n" + "29:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 30f\n" + "str s20, [x9], #0x4\n" + "tbz x11, #0, 31f\n" + "st1 { v20.h }[2], [x9]\n" + "b 31f\n" + "30:" // Height 1: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "31:" // Height 1: Partial direct writeback: Done + "b 33f\n" + "32:" // Height 1: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "33:" // Height 1: Writeback 
done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 200f\n" + "34:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "35:" // Height 2: Column loop + "cbz x10, 36f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "mov v22.16b, v20.16b\n" + "mov v23.16b, v21.16b\n" + "b 47f\n" + "36:" // Height 2: no bias + "tbz %x[flags], #0, 46f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #1\n" + "bge 45f\n" + "tbz x11, #3, 40f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "ld1 { v22.8h }, [x26], #0x10\n" + "tbz x11, #2, 38f\n" + "ldr d21, [x9], #0x8\n" + "ldr d23, [x26], #0x8\n" + "tbz x11, #1, 37f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "tbz x11, #0, 44f\n" + "ld1 { v21.h }[6], [x9]\n" + "ld1 { v23.h }[6], [x26]\n" + "b 44f\n" + "37:" // Height 2: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 44f\n" + "ld1 { v21.h }[4], [x9]\n" + "ld1 { v23.h }[4], [x26]\n" + "b 44f\n" + "38:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 39f\n" + "ldr s21, [x9], #0x4\n" + "ldr s23, [x26], #0x4\n" + "mov x20, #0x14\n" + "tbz x11, #0, 44f\n" + "ld1 { v21.h }[2], [x9]\n" + "ld1 { v23.h }[2], [x26]\n" + "b 44f\n" + "39:" // Height 2: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 44f\n" + "ldr h21, [x9, #0x0]\n" + "ldr h23, [x26, #0x0]\n" + "b 44f\n" + "40:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 42f\n" + "ldr d20, [x9], #0x8\n" + "ldr d22, [x26], #0x8\n" + "tbz x11, #1, 41f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "tbz x11, #0, 44f\n" + "ld1 { v20.h }[6], [x9]\n" + "ld1 { v22.h }[6], [x26]\n" + "b 44f\n" + "41:" // Height 2: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 44f\n" + "ld1 { 
v20.h }[4], [x9]\n" + "ld1 { v22.h }[4], [x26]\n" + "b 44f\n" + "42:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 43f\n" + "ldr s20, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "mov x20, #0x4\n" + "tbz x11, #0, 44f\n" + "ld1 { v20.h }[2], [x9]\n" + "ld1 { v22.h }[2], [x26]\n" + "b 44f\n" + "43:" // Height 2: Partial accumulate: partial_1_0 + "ldr h20, [x9, #0x0]\n" + "ldr h22, [x26, #0x0]\n" + "mov x20, #0x0\n" + "44:" // Height 2: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 47f\n" + "45:" // Height 2: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "ldr q22, [x26, #0x0]\n" + "ldr q23, [x26, #0x10]\n" + "b 47f\n" + "46:" // Height 2: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "47:" // Height 2: setup done + "mov x28, #0x0\n" + "48:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 49f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "cbnz x28, 50f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "b 50f\n" + "49:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "50:" // Height 2: input setup done + "cmp x27, #0x8\n" + "blt 53f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 
52f\n" + "51:" // Height 2: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "sub x27, x27, #0x8\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 51b\n" + "52:" // Height 2: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + 
"ldr q6, [x10, #0xe0]\n" + "add x26, x26, #0x10\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x25, x25, #0x10\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "53:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 55f\n" + "54:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v22.8h, v8.8h, v1.h[0]\n" + "fmla v21.8h, v9.8h, v0.h[0]\n" + "fmla v23.8h, v9.8h, v1.h[0]\n" + "cbnz x27, 54b\n" + "55:" // Height 2: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 48b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm 
pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 56f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmin v22.8h, v22.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "fmax v22.8h, v22.8h, v16.8h\n" + "fmax v23.8h, v23.8h, v16.8h\n" + "56:" // Height 2: No activation + "cmp x11, #0x10\n" + "bge 65f\n" + "tbz x11, #3, 60f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "st1 { v22.8h }, [x26], #0x10\n" + "tbz x11, #2, 58f\n" + "str d21, [x9], #0x8\n" + "str d23, [x26], #0x8\n" + "tbz x11, #1, 57f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "tbz x11, #0, 64f\n" + "st1 { v21.h }[6], [x9]\n" + "st1 { v23.h }[6], [x26]\n" + "b 64f\n" + "57:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 64f\n" + "st1 { v21.h }[4], [x9]\n" + "st1 { v23.h }[4], [x26]\n" + "b 64f\n" + "58:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 59f\n" + "str s21, [x9], #0x4\n" + "str s23, [x26], #0x4\n" + "tbz x11, #0, 64f\n" + "st1 { v21.h }[2], [x9]\n" + "st1 { v23.h }[2], [x26]\n" + "b 64f\n" + "59:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 64f\n" + "str h21, [x9, #0x0]\n" + "str h23, [x26, #0x0]\n" + "b 64f\n" + "60:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 62f\n" + "str d20, [x9], #0x8\n" + "str d22, [x26], #0x8\n" + "tbz x11, #1, 61f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "st1 { v22.s }[2], [x26], #0x4\n" + "tbz x11, #0, 64f\n" + "st1 { v20.h }[6], [x9]\n" + "st1 { v22.h }[6], [x26]\n" + "b 64f\n" + "61:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 64f\n" + "st1 { v20.h }[4], [x9]\n" + "st1 { v22.h }[4], [x26]\n" + "b 64f\n" + "62:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 63f\n" + "str s20, 
[x9], #0x4\n" + "str s22, [x26], #0x4\n" + "tbz x11, #0, 64f\n" + "st1 { v20.h }[2], [x9]\n" + "st1 { v22.h }[2], [x26]\n" + "b 64f\n" + "63:" // Height 2: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "str h22, [x26, #0x0]\n" + "64:" // Height 2: Partial direct writeback: Done + "b 66f\n" + "65:" // Height 2: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "str q22, [x26, #0x0]\n" + "str q23, [x26, #0x10]\n" + "66:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 35b\n" + "b 200f\n" + "67:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "68:" // Height 3: Column loop + "cbz x10, 69f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "mov v22.16b, v20.16b\n" + "mov v23.16b, v21.16b\n" + "mov v24.16b, v20.16b\n" + "mov v25.16b, v21.16b\n" + "b 80f\n" + "69:" // Height 3: no bias + "tbz %x[flags], #0, 79f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "bge 78f\n" + "tbz x11, #3, 73f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "ld1 { v22.8h }, [x26], #0x10\n" + "ld1 { v24.8h }, [x25], #0x10\n" + "tbz x11, #2, 71f\n" + "ldr d21, [x9], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "tbz x11, #1, 70f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "tbz x11, #0, 77f\n" + "ld1 { v21.h }[6], [x9]\n" + "ld1 { v23.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "b 77f\n" + "70:" // Height 3: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 77f\n" + "ld1 { v21.h }[4], [x9]\n" + "ld1 { v23.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" + "b 77f\n" + "71:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 72f\n" + "ldr s21, [x9], 
#0x4\n" + "ldr s23, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s25, [x25], #0x4\n" + "tbz x11, #0, 77f\n" + "ld1 { v21.h }[2], [x9]\n" + "ld1 { v23.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "b 77f\n" + "72:" // Height 3: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 77f\n" + "ldr h21, [x9, #0x0]\n" + "ldr h23, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "b 77f\n" + "73:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 75f\n" + "ldr d20, [x9], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "tbz x11, #1, 74f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "tbz x11, #0, 77f\n" + "ld1 { v20.h }[6], [x9]\n" + "ld1 { v22.h }[6], [x26]\n" + "ld1 { v24.h }[6], [x25]\n" + "b 77f\n" + "74:" // Height 3: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 77f\n" + "ld1 { v20.h }[4], [x9]\n" + "ld1 { v22.h }[4], [x26]\n" + "ld1 { v24.h }[4], [x25]\n" + "b 77f\n" + "75:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 76f\n" + "ldr s20, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s24, [x25], #0x4\n" + "tbz x11, #0, 77f\n" + "ld1 { v20.h }[2], [x9]\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "b 77f\n" + "76:" // Height 3: Partial accumulate: partial_1_0 + "ldr h20, [x9, #0x0]\n" + "ldr h22, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h24, [x25, #0x0]\n" + "77:" // Height 3: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 80f\n" + "78:" // Height 3: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "ldr q22, [x26, #0x0]\n" + "ldr q23, [x26, #0x10]\n" + "ldr q24, [x25, #0x0]\n" + "ldr q25, [x25, #0x10]\n" + "b 80f\n" + "79:" // Height 3: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "80:" // Height 3: setup done + "mov x28, 
#0x0\n" + "81:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 82f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x28, 83f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "b 83f\n" + "82:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "83:" // Height 3: input setup done + "cmp x27, #0x8\n" + "blt 86f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 85f\n" + "84:" // Height 3: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "add x25, x25, #0x10\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x24, x24, #0x10\n" + "cmp x27, #0x10\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "fmla v21.8h, v9.8h, 
v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 84b\n" + "85:" // Height 3: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + 
"ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x25, #0x80]\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "86:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 88f\n" + "87:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr 
h2, [x24], #0x2\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v22.8h, v8.8h, v1.h[0]\n" + "fmla v24.8h, v8.8h, v2.h[0]\n" + "fmla v21.8h, v9.8h, v0.h[0]\n" + "fmla v23.8h, v9.8h, v1.h[0]\n" + "fmla v25.8h, v9.8h, v2.h[0]\n" + "cbnz x27, 87b\n" + "88:" // Height 3: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 81b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 89f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmin v22.8h, v22.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "fmax v22.8h, v22.8h, v16.8h\n" + "fmax v23.8h, v23.8h, v16.8h\n" + "fmax v24.8h, v24.8h, v16.8h\n" + "fmax v25.8h, v25.8h, v16.8h\n" + "89:" // Height 3: No activation + "cmp x11, #0x10\n" + "bge 98f\n" + "tbz x11, #3, 93f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "st1 { v22.8h }, [x26], #0x10\n" + "st1 { v24.8h }, [x25], #0x10\n" + "tbz x11, #2, 91f\n" + "str d21, [x9], #0x8\n" + "str d23, [x26], #0x8\n" + "str d25, [x25], #0x8\n" + "tbz x11, #1, 90f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v25.s }[2], [x25], #0x4\n" + "tbz x11, #0, 97f\n" + "st1 { v21.h }[6], [x9]\n" + "st1 { v23.h }[6], [x26]\n" + "st1 { v25.h }[6], [x25]\n" + "b 97f\n" + "90:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 97f\n" + "st1 { v21.h }[4], [x9]\n" + "st1 { v23.h }[4], [x26]\n" + "st1 { 
v25.h }[4], [x25]\n" + "b 97f\n" + "91:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 92f\n" + "str s21, [x9], #0x4\n" + "str s23, [x26], #0x4\n" + "str s25, [x25], #0x4\n" + "tbz x11, #0, 97f\n" + "st1 { v21.h }[2], [x9]\n" + "st1 { v23.h }[2], [x26]\n" + "st1 { v25.h }[2], [x25]\n" + "b 97f\n" + "92:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 97f\n" + "str h21, [x9, #0x0]\n" + "str h23, [x26, #0x0]\n" + "str h25, [x25, #0x0]\n" + "b 97f\n" + "93:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 95f\n" + "str d20, [x9], #0x8\n" + "str d22, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "tbz x11, #1, 94f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "st1 { v22.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "tbz x11, #0, 97f\n" + "st1 { v20.h }[6], [x9]\n" + "st1 { v22.h }[6], [x26]\n" + "st1 { v24.h }[6], [x25]\n" + "b 97f\n" + "94:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 97f\n" + "st1 { v20.h }[4], [x9]\n" + "st1 { v22.h }[4], [x26]\n" + "st1 { v24.h }[4], [x25]\n" + "b 97f\n" + "95:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 96f\n" + "str s20, [x9], #0x4\n" + "str s22, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "tbz x11, #0, 97f\n" + "st1 { v20.h }[2], [x9]\n" + "st1 { v22.h }[2], [x26]\n" + "st1 { v24.h }[2], [x25]\n" + "b 97f\n" + "96:" // Height 3: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "str h22, [x26, #0x0]\n" + "str h24, [x25, #0x0]\n" + "97:" // Height 3: Partial direct writeback: Done + "b 99f\n" + "98:" // Height 3: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "str q22, [x26, #0x0]\n" + "str q23, [x26, #0x10]\n" + "str q24, [x25, #0x0]\n" + "str q25, [x25, #0x10]\n" + "99:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 68b\n" + "b 200f\n" + "100:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], 
%[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "101:" // Height 4: Column loop + "cbz x10, 102f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "mov v22.16b, v20.16b\n" + "mov v23.16b, v21.16b\n" + "mov v24.16b, v20.16b\n" + "mov v25.16b, v21.16b\n" + "mov v26.16b, v20.16b\n" + "mov v27.16b, v21.16b\n" + "b 113f\n" + "102:" // Height 4: no bias + "tbz %x[flags], #0, 112f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "bge 111f\n" + "tbz x11, #3, 106f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "ld1 { v22.8h }, [x26], #0x10\n" + "ld1 { v24.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x24], #0x10\n" + "tbz x11, #2, 104f\n" + "ldr d21, [x9], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "tbz x11, #1, 103f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "tbz x11, #0, 110f\n" + "ld1 { v21.h }[6], [x9]\n" + "ld1 { v23.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "b 110f\n" + "103:" // Height 4: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 110f\n" + "ld1 { v21.h }[4], [x9]\n" + "ld1 { v23.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "b 110f\n" + "104:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 105f\n" + "ldr s21, [x9], #0x4\n" + "ldr s23, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s25, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "tbz x11, #0, 110f\n" + "ld1 { v21.h }[2], [x9]\n" + "ld1 { v23.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "b 110f\n" + "105:" // Height 4: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 110f\n" + "ldr h21, [x9, #0x0]\n" + "ldr 
h23, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "b 110f\n" + "106:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 108f\n" + "ldr d20, [x9], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "tbz x11, #1, 107f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "tbz x11, #0, 110f\n" + "ld1 { v20.h }[6], [x9]\n" + "ld1 { v22.h }[6], [x26]\n" + "ld1 { v24.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "b 110f\n" + "107:" // Height 4: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 110f\n" + "ld1 { v20.h }[4], [x9]\n" + "ld1 { v22.h }[4], [x26]\n" + "ld1 { v24.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "b 110f\n" + "108:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 109f\n" + "ldr s20, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s24, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "tbz x11, #0, 110f\n" + "ld1 { v20.h }[2], [x9]\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "b 110f\n" + "109:" // Height 4: Partial accumulate: partial_1_0 + "ldr h20, [x9, #0x0]\n" + "ldr h22, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h24, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "110:" // Height 4: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 113f\n" + "111:" // Height 4: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "ldr q22, [x26, #0x0]\n" + "ldr q23, [x26, #0x10]\n" + "ldr q24, [x25, #0x0]\n" + "ldr q25, [x25, #0x10]\n" + "ldr q26, [x24, #0x0]\n" + "ldr q27, [x24, #0x10]\n" + "b 113f\n" + "112:" // Height 4: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "113:" // Height 4: 
setup done + "mov x28, #0x0\n" + "114:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 115f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x28, 116f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "b 116f\n" + "115:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "116:" // Height 4: input setup done + "cmp x27, #0x8\n" + "blt 119f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 118f\n" + "117:" // Height 4: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "add x25, x25, #0x10\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "cmp x27, #0x10\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla 
v22.8h, v8.8h, v1.h[1]\n" + "add x10, x10, #0x100\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v27.8h, v13.8h, v3.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, v3.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + 
"fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 117b\n" + "118:" // Height 4: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "add x24, x24, #0x10\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "add x23, x23, #0x10\n" + "sub x27, x27, #0x8\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "add x10, x10, #0x100\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, 
v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v27.8h, v13.8h, v3.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, v3.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "119:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 121f\n" + "120:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v22.8h, v8.8h, v1.h[0]\n" + "fmla v24.8h, v8.8h, v2.h[0]\n" + "fmla v26.8h, v8.8h, v3.h[0]\n" + "fmla v21.8h, v9.8h, v0.h[0]\n" + "fmla v23.8h, v9.8h, v1.h[0]\n" + "fmla v25.8h, v9.8h, v2.h[0]\n" + "fmla v27.8h, v9.8h, v3.h[0]\n" + "cbnz x27, 120b\n" + "121:" // Height 4: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 114b\n" + 
"ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 122f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmin v22.8h, v22.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "fmax v22.8h, v22.8h, v16.8h\n" + "fmax v23.8h, v23.8h, v16.8h\n" + "fmax v24.8h, v24.8h, v16.8h\n" + "fmax v25.8h, v25.8h, v16.8h\n" + "fmax v26.8h, v26.8h, v16.8h\n" + "fmax v27.8h, v27.8h, v16.8h\n" + "122:" // Height 4: No activation + "cmp x11, #0x10\n" + "bge 131f\n" + "tbz x11, #3, 126f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "st1 { v22.8h }, [x26], #0x10\n" + "st1 { v24.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x24], #0x10\n" + "tbz x11, #2, 124f\n" + "str d21, [x9], #0x8\n" + "str d23, [x26], #0x8\n" + "str d25, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "tbz x11, #1, 123f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v25.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x24], #0x4\n" + "tbz x11, #0, 130f\n" + "st1 { v21.h }[6], [x9]\n" + "st1 { v23.h }[6], [x26]\n" + "st1 { v25.h }[6], [x25]\n" + "st1 { v27.h }[6], [x24]\n" + "b 130f\n" + "123:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 130f\n" + "st1 { v21.h }[4], [x9]\n" + "st1 { v23.h }[4], [x26]\n" + "st1 { v25.h }[4], [x25]\n" + "st1 { v27.h }[4], [x24]\n" + "b 130f\n" + "124:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 125f\n" + "str 
s21, [x9], #0x4\n" + "str s23, [x26], #0x4\n" + "str s25, [x25], #0x4\n" + "str s27, [x24], #0x4\n" + "tbz x11, #0, 130f\n" + "st1 { v21.h }[2], [x9]\n" + "st1 { v23.h }[2], [x26]\n" + "st1 { v25.h }[2], [x25]\n" + "st1 { v27.h }[2], [x24]\n" + "b 130f\n" + "125:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 130f\n" + "str h21, [x9, #0x0]\n" + "str h23, [x26, #0x0]\n" + "str h25, [x25, #0x0]\n" + "str h27, [x24, #0x0]\n" + "b 130f\n" + "126:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 128f\n" + "str d20, [x9], #0x8\n" + "str d22, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "str d26, [x24], #0x8\n" + "tbz x11, #1, 127f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "st1 { v22.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x24], #0x4\n" + "tbz x11, #0, 130f\n" + "st1 { v20.h }[6], [x9]\n" + "st1 { v22.h }[6], [x26]\n" + "st1 { v24.h }[6], [x25]\n" + "st1 { v26.h }[6], [x24]\n" + "b 130f\n" + "127:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 130f\n" + "st1 { v20.h }[4], [x9]\n" + "st1 { v22.h }[4], [x26]\n" + "st1 { v24.h }[4], [x25]\n" + "st1 { v26.h }[4], [x24]\n" + "b 130f\n" + "128:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 129f\n" + "str s20, [x9], #0x4\n" + "str s22, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s26, [x24], #0x4\n" + "tbz x11, #0, 130f\n" + "st1 { v20.h }[2], [x9]\n" + "st1 { v22.h }[2], [x26]\n" + "st1 { v24.h }[2], [x25]\n" + "st1 { v26.h }[2], [x24]\n" + "b 130f\n" + "129:" // Height 4: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "str h22, [x26, #0x0]\n" + "str h24, [x25, #0x0]\n" + "str h26, [x24, #0x0]\n" + "130:" // Height 4: Partial direct writeback: Done + "b 132f\n" + "131:" // Height 4: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "str q22, [x26, #0x0]\n" + "str q23, [x26, #0x10]\n" + "str q24, [x25, #0x0]\n" + "str q25, [x25, #0x10]\n" + 
"str q26, [x24, #0x0]\n" + "str q27, [x24, #0x10]\n" + "132:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 101b\n" + "b 200f\n" + "133:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "134:" // Height 5: Column loop + "cbz x10, 135f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "mov v22.16b, v20.16b\n" + "mov v23.16b, v21.16b\n" + "mov v24.16b, v20.16b\n" + "mov v25.16b, v21.16b\n" + "mov v26.16b, v20.16b\n" + "mov v27.16b, v21.16b\n" + "mov v28.16b, v20.16b\n" + "mov v29.16b, v21.16b\n" + "b 146f\n" + "135:" // Height 5: no bias + "tbz %x[flags], #0, 145f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "add x23, x24, x20, LSL #1\n" + "bge 144f\n" + "tbz x11, #3, 139f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "ld1 { v22.8h }, [x26], #0x10\n" + "ld1 { v24.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x24], #0x10\n" + "ld1 { v28.8h }, [x23], #0x10\n" + "tbz x11, #2, 137f\n" + "ldr d21, [x9], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d29, [x23], #0x8\n" + "tbz x11, #1, 136f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v29.s }[2], [x23], #0x4\n" + "tbz x11, #0, 143f\n" + "ld1 { v21.h }[6], [x9]\n" + "ld1 { v23.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v29.h }[6], [x23]\n" + "b 143f\n" + "136:" // Height 5: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 143f\n" + "ld1 { v21.h }[4], [x9]\n" + "ld1 { v23.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v29.h }[4], [x23]\n" + "b 143f\n" + "137:" // 
Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 138f\n" + "ldr s21, [x9], #0x4\n" + "ldr s23, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s25, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s29, [x23], #0x4\n" + "tbz x11, #0, 143f\n" + "ld1 { v21.h }[2], [x9]\n" + "ld1 { v23.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v29.h }[2], [x23]\n" + "b 143f\n" + "138:" // Height 5: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 143f\n" + "ldr h21, [x9, #0x0]\n" + "ldr h23, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h29, [x23, #0x0]\n" + "b 143f\n" + "139:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 141f\n" + "ldr d20, [x9], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "tbz x11, #1, 140f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "tbz x11, #0, 143f\n" + "ld1 { v20.h }[6], [x9]\n" + "ld1 { v22.h }[6], [x26]\n" + "ld1 { v24.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v28.h }[6], [x23]\n" + "b 143f\n" + "140:" // Height 5: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 143f\n" + "ld1 { v20.h }[4], [x9]\n" + "ld1 { v22.h }[4], [x26]\n" + "ld1 { v24.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v28.h }[4], [x23]\n" + "b 143f\n" + "141:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 142f\n" + "ldr s20, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s24, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "tbz x11, #0, 143f\n" + "ld1 { v20.h }[2], [x9]\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v28.h }[2], [x23]\n" + "b 143f\n" + "142:" // Height 5: Partial accumulate: 
partial_1_0 + "ldr h20, [x9, #0x0]\n" + "ldr h22, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h24, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h28, [x23, #0x0]\n" + "143:" // Height 5: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 146f\n" + "144:" // Height 5: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "ldr q22, [x26, #0x0]\n" + "ldr q23, [x26, #0x10]\n" + "ldr q24, [x25, #0x0]\n" + "ldr q25, [x25, #0x10]\n" + "ldr q26, [x24, #0x0]\n" + "ldr q27, [x24, #0x10]\n" + "ldr q28, [x23, #0x0]\n" + "ldr q29, [x23, #0x10]\n" + "b 146f\n" + "145:" // Height 5: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "146:" // Height 5: setup done + "mov x28, #0x0\n" + "147:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 148f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 149f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "b 149f\n" + "148:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "149:" // Height 5: input setup done + "cmp x27, #0x8\n" + "blt 152f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr 
q4, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 151f\n" + "150:" // Height 5: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x10\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "fmla v29.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v28.8h, v8.8h, v4.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "fmla v29.8h, v9.8h, v4.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "fmla v28.8h, v10.8h, v4.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "fmla v29.8h, 
v11.8h, v4.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "fmla v28.8h, v12.8h, v4.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v27.8h, v13.8h, v3.h[3]\n" + "fmla v29.8h, v13.8h, v4.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "fmla v28.8h, v14.8h, v4.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "fmla v29.8h, v15.8h, v4.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "fmla v28.8h, v16.8h, v4.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "fmla v29.8h, v17.8h, v4.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, v3.h[6]\n" + "fmla v28.8h, v18.8h, v4.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "fmla v29.8h, v19.8h, v4.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "fmla v28.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla 
v25.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v29.8h, v7.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 150b\n" + "151:" // Height 5: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v28.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "fmla v29.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "add x10, x10, #0x100\n" + "fmla v28.8h, v8.8h, v4.h[1]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "fmla v29.8h, v9.8h, v4.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "fmla v28.8h, v10.8h, v4.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "fmla v29.8h, v11.8h, v4.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "fmla v28.8h, v12.8h, v4.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + 
"fmla v27.8h, v13.8h, v3.h[3]\n" + "fmla v29.8h, v13.8h, v4.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "fmla v28.8h, v14.8h, v4.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "fmla v29.8h, v15.8h, v4.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "fmla v28.8h, v16.8h, v4.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "fmla v29.8h, v17.8h, v4.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, v3.h[6]\n" + "fmla v28.8h, v18.8h, v4.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "fmla v29.8h, v19.8h, v4.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "fmla v28.8h, v6.8h, v4.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "fmla v29.8h, v7.8h, v4.h[7]\n" + "152:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 154f\n" + "153:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v22.8h, v8.8h, v1.h[0]\n" + "fmla v24.8h, v8.8h, v2.h[0]\n" + "fmla v26.8h, v8.8h, v3.h[0]\n" + "fmla v28.8h, v8.8h, v4.h[0]\n" + "fmla 
v21.8h, v9.8h, v0.h[0]\n" + "fmla v23.8h, v9.8h, v1.h[0]\n" + "fmla v25.8h, v9.8h, v2.h[0]\n" + "fmla v27.8h, v9.8h, v3.h[0]\n" + "fmla v29.8h, v9.8h, v4.h[0]\n" + "cbnz x27, 153b\n" + "154:" // Height 5: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 147b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x20, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 155f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmin v22.8h, v22.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "fmax v22.8h, v22.8h, v16.8h\n" + "fmax v23.8h, v23.8h, v16.8h\n" + "fmax v24.8h, v24.8h, v16.8h\n" + "fmax v25.8h, v25.8h, v16.8h\n" + "fmax v26.8h, v26.8h, v16.8h\n" + "fmax v27.8h, v27.8h, v16.8h\n" + "fmax v28.8h, v28.8h, v16.8h\n" + "fmax v29.8h, v29.8h, v16.8h\n" + "155:" // Height 5: No activation + "cmp x11, #0x10\n" + "bge 164f\n" + "tbz x11, #3, 159f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "st1 { v22.8h }, [x26], #0x10\n" + "st1 { v24.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x24], #0x10\n" + "st1 { v28.8h }, [x23], #0x10\n" + "tbz x11, #2, 157f\n" + "str d21, [x9], #0x8\n" + "str d23, [x26], #0x8\n" + "str d25, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "tbz x11, 
#1, 156f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v25.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x24], #0x4\n" + "st1 { v29.s }[2], [x23], #0x4\n" + "tbz x11, #0, 163f\n" + "st1 { v21.h }[6], [x9]\n" + "st1 { v23.h }[6], [x26]\n" + "st1 { v25.h }[6], [x25]\n" + "st1 { v27.h }[6], [x24]\n" + "st1 { v29.h }[6], [x23]\n" + "b 163f\n" + "156:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 163f\n" + "st1 { v21.h }[4], [x9]\n" + "st1 { v23.h }[4], [x26]\n" + "st1 { v25.h }[4], [x25]\n" + "st1 { v27.h }[4], [x24]\n" + "st1 { v29.h }[4], [x23]\n" + "b 163f\n" + "157:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 158f\n" + "str s21, [x9], #0x4\n" + "str s23, [x26], #0x4\n" + "str s25, [x25], #0x4\n" + "str s27, [x24], #0x4\n" + "str s29, [x23], #0x4\n" + "tbz x11, #0, 163f\n" + "st1 { v21.h }[2], [x9]\n" + "st1 { v23.h }[2], [x26]\n" + "st1 { v25.h }[2], [x25]\n" + "st1 { v27.h }[2], [x24]\n" + "st1 { v29.h }[2], [x23]\n" + "b 163f\n" + "158:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 163f\n" + "str h21, [x9, #0x0]\n" + "str h23, [x26, #0x0]\n" + "str h25, [x25, #0x0]\n" + "str h27, [x24, #0x0]\n" + "str h29, [x23, #0x0]\n" + "b 163f\n" + "159:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 161f\n" + "str d20, [x9], #0x8\n" + "str d22, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "str d26, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "tbz x11, #1, 160f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "st1 { v22.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x24], #0x4\n" + "st1 { v28.s }[2], [x23], #0x4\n" + "tbz x11, #0, 163f\n" + "st1 { v20.h }[6], [x9]\n" + "st1 { v22.h }[6], [x26]\n" + "st1 { v24.h }[6], [x25]\n" + "st1 { v26.h }[6], [x24]\n" + "st1 { v28.h }[6], [x23]\n" + "b 163f\n" + "160:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 163f\n" + "st1 { v20.h }[4], [x9]\n" + "st1 { v22.h }[4], 
[x26]\n" + "st1 { v24.h }[4], [x25]\n" + "st1 { v26.h }[4], [x24]\n" + "st1 { v28.h }[4], [x23]\n" + "b 163f\n" + "161:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 162f\n" + "str s20, [x9], #0x4\n" + "str s22, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s26, [x24], #0x4\n" + "str s28, [x23], #0x4\n" + "tbz x11, #0, 163f\n" + "st1 { v20.h }[2], [x9]\n" + "st1 { v22.h }[2], [x26]\n" + "st1 { v24.h }[2], [x25]\n" + "st1 { v26.h }[2], [x24]\n" + "st1 { v28.h }[2], [x23]\n" + "b 163f\n" + "162:" // Height 5: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "str h22, [x26, #0x0]\n" + "str h24, [x25, #0x0]\n" + "str h26, [x24, #0x0]\n" + "str h28, [x23, #0x0]\n" + "163:" // Height 5: Partial direct writeback: Done + "b 165f\n" + "164:" // Height 5: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "str q22, [x26, #0x0]\n" + "str q23, [x26, #0x10]\n" + "str q24, [x25, #0x0]\n" + "str q25, [x25, #0x10]\n" + "str q26, [x24, #0x0]\n" + "str q27, [x24, #0x10]\n" + "str q28, [x23, #0x0]\n" + "str q29, [x23, #0x10]\n" + "165:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 134b\n" + "b 200f\n" + "166:" // Height 6 + "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "mov x20, #0xc\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "madd x20, x21, x20, x9\n" + "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "167:" // Height 6: Column loop + "cbz x10, 168f\n" + "ldr q20, [x10, #0x0]\n" + "ldr q21, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "mov v22.16b, v20.16b\n" + "mov v23.16b, v21.16b\n" + "mov v24.16b, v20.16b\n" + "mov v25.16b, v21.16b\n" + "mov v26.16b, v20.16b\n" + "mov v27.16b, v21.16b\n" + "mov v28.16b, v20.16b\n" + "mov v29.16b, v21.16b\n" + "mov v30.16b, v20.16b\n" + "mov v31.16b, v21.16b\n" + "b 179f\n" + "168:" // Height 6: no bias + "tbz %x[flags], #0, 
178f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "add x23, x24, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" + "bge 177f\n" + "tbz x11, #3, 172f\n" + "ld1 { v20.8h }, [x9], #0x10\n" + "ld1 { v22.8h }, [x26], #0x10\n" + "ld1 { v24.8h }, [x25], #0x10\n" + "ld1 { v26.8h }, [x24], #0x10\n" + "ld1 { v28.8h }, [x23], #0x10\n" + "ld1 { v30.8h }, [x22], #0x10\n" + "tbz x11, #2, 170f\n" + "ldr d21, [x9], #0x8\n" + "ldr d23, [x26], #0x8\n" + "ldr d25, [x25], #0x8\n" + "ldr d27, [x24], #0x8\n" + "ldr d29, [x23], #0x8\n" + "ldr d31, [x22], #0x8\n" + "tbz x11, #1, 169f\n" + "ld1 { v21.s }[2], [x9], #0x4\n" + "ld1 { v23.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v25.s }[2], [x25], #0x4\n" + "ld1 { v27.s }[2], [x24], #0x4\n" + "ld1 { v29.s }[2], [x23], #0x4\n" + "ld1 { v31.s }[2], [x22], #0x4\n" + "tbz x11, #0, 176f\n" + "ld1 { v21.h }[6], [x9]\n" + "ld1 { v23.h }[6], [x26]\n" + "ld1 { v25.h }[6], [x25]\n" + "ld1 { v27.h }[6], [x24]\n" + "ld1 { v29.h }[6], [x23]\n" + "ld1 { v31.h }[6], [x22]\n" + "b 176f\n" + "169:" // Height 6: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 176f\n" + "ld1 { v21.h }[4], [x9]\n" + "ld1 { v23.h }[4], [x26]\n" + "ld1 { v25.h }[4], [x25]\n" + "ld1 { v27.h }[4], [x24]\n" + "ld1 { v29.h }[4], [x23]\n" + "ld1 { v31.h }[4], [x22]\n" + "b 176f\n" + "170:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 171f\n" + "ldr s21, [x9], #0x4\n" + "ldr s23, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s25, [x25], #0x4\n" + "ldr s27, [x24], #0x4\n" + "ldr s29, [x23], #0x4\n" + "ldr s31, [x22], #0x4\n" + "tbz x11, #0, 176f\n" + "ld1 { v21.h }[2], [x9]\n" + "ld1 { v23.h }[2], [x26]\n" + "ld1 { v25.h }[2], [x25]\n" + "ld1 { v27.h }[2], [x24]\n" + "ld1 { v29.h }[2], [x23]\n" + "ld1 { v31.h }[2], [x22]\n" + "b 176f\n" + "171:" // Height 6: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + 
"tbz x11, #0, 176f\n" + "ldr h21, [x9, #0x0]\n" + "ldr h23, [x26, #0x0]\n" + "ldr h25, [x25, #0x0]\n" + "ldr h27, [x24, #0x0]\n" + "ldr h29, [x23, #0x0]\n" + "ldr h31, [x22, #0x0]\n" + "b 176f\n" + "172:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 174f\n" + "ldr d20, [x9], #0x8\n" + "ldr d22, [x26], #0x8\n" + "ldr d24, [x25], #0x8\n" + "ldr d26, [x24], #0x8\n" + "ldr d28, [x23], #0x8\n" + "ldr d30, [x22], #0x8\n" + "tbz x11, #1, 173f\n" + "ld1 { v20.s }[2], [x9], #0x4\n" + "ld1 { v22.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v24.s }[2], [x25], #0x4\n" + "ld1 { v26.s }[2], [x24], #0x4\n" + "ld1 { v28.s }[2], [x23], #0x4\n" + "ld1 { v30.s }[2], [x22], #0x4\n" + "tbz x11, #0, 176f\n" + "ld1 { v20.h }[6], [x9]\n" + "ld1 { v22.h }[6], [x26]\n" + "ld1 { v24.h }[6], [x25]\n" + "ld1 { v26.h }[6], [x24]\n" + "ld1 { v28.h }[6], [x23]\n" + "ld1 { v30.h }[6], [x22]\n" + "b 176f\n" + "173:" // Height 6: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 176f\n" + "ld1 { v20.h }[4], [x9]\n" + "ld1 { v22.h }[4], [x26]\n" + "ld1 { v24.h }[4], [x25]\n" + "ld1 { v26.h }[4], [x24]\n" + "ld1 { v28.h }[4], [x23]\n" + "ld1 { v30.h }[4], [x22]\n" + "b 176f\n" + "174:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 175f\n" + "ldr s20, [x9], #0x4\n" + "ldr s22, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s24, [x25], #0x4\n" + "ldr s26, [x24], #0x4\n" + "ldr s28, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "tbz x11, #0, 176f\n" + "ld1 { v20.h }[2], [x9]\n" + "ld1 { v22.h }[2], [x26]\n" + "ld1 { v24.h }[2], [x25]\n" + "ld1 { v26.h }[2], [x24]\n" + "ld1 { v28.h }[2], [x23]\n" + "ld1 { v30.h }[2], [x22]\n" + "b 176f\n" + "175:" // Height 6: Partial accumulate: partial_1_0 + "ldr h20, [x9, #0x0]\n" + "ldr h22, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h24, [x25, #0x0]\n" + "ldr h26, [x24, #0x0]\n" + "ldr h28, [x23, #0x0]\n" + "ldr h30, [x22, #0x0]\n" + "176:" // Height 6: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 179f\n" + 
"177:" // Height 6: full accumulate + "ldr q20, [x9, #0x0]\n" + "ldr q21, [x9, #0x10]\n" + "ldr q22, [x26, #0x0]\n" + "ldr q23, [x26, #0x10]\n" + "ldr q24, [x25, #0x0]\n" + "ldr q25, [x25, #0x10]\n" + "ldr q26, [x24, #0x0]\n" + "ldr q27, [x24, #0x10]\n" + "ldr q28, [x23, #0x0]\n" + "ldr q29, [x23, #0x10]\n" + "ldr q30, [x22, #0x0]\n" + "ldr q31, [x22, #0x10]\n" + "b 179f\n" + "178:" // Height 6: no accumulate + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "179:" // Height 6: setup done + "mov x28, #0x0\n" + "180:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 181f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" + "cbnz x28, 182f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "add x21, x21, x20, LSL #1\n" + "b 182f\n" + "181:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" + "182:" // Height 6: input setup done + "cmp x27, #0x8\n" + "blt 185f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x21, #0x0]\n" + "ldr 
q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "ldr q8, [x10, #0x20]\n" + "ldr q9, [x10, #0x30]\n" + "ldr q10, [x10, #0x40]\n" + "ldr q11, [x10, #0x50]\n" + "ldr q12, [x10, #0x60]\n" + "ldr q13, [x10, #0x70]\n" + "ldr q14, [x10, #0x80]\n" + "ldr q15, [x10, #0x90]\n" + "ldr q16, [x10, #0xa0]\n" + "ldr q17, [x10, #0xb0]\n" + "ldr q18, [x10, #0xc0]\n" + "ldr q19, [x10, #0xd0]\n" + "blt 184f\n" + "183:" // Height 6: Multiply loop: Main loop head + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v28.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "add x23, x23, #0x10\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v29.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v28.8h, v8.8h, v4.h[1]\n" + "fmla v30.8h, v8.8h, v5.h[1]\n" + "ldr q8, [x10, #0x20]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v21.8h, v9.8h, v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "fmla v29.8h, v9.8h, v4.h[1]\n" + "fmla v31.8h, v9.8h, v5.h[1]\n" + "ldr q9, [x10, #0x30]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "fmla v28.8h, v10.8h, v4.h[2]\n" + "fmla 
v30.8h, v10.8h, v5.h[2]\n" + "ldr q10, [x10, #0x40]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "fmla v29.8h, v11.8h, v4.h[2]\n" + "fmla v31.8h, v11.8h, v5.h[2]\n" + "ldr q11, [x10, #0x50]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "fmla v28.8h, v12.8h, v4.h[3]\n" + "fmla v30.8h, v12.8h, v5.h[3]\n" + "ldr q12, [x10, #0x60]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v27.8h, v13.8h, v3.h[3]\n" + "fmla v29.8h, v13.8h, v4.h[3]\n" + "fmla v31.8h, v13.8h, v5.h[3]\n" + "ldr q13, [x10, #0x70]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "fmla v28.8h, v14.8h, v4.h[4]\n" + "fmla v30.8h, v14.8h, v5.h[4]\n" + "ldr q14, [x10, #0x80]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "fmla v29.8h, v15.8h, v4.h[4]\n" + "fmla v31.8h, v15.8h, v5.h[4]\n" + "ldr q15, [x10, #0x90]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "fmla v28.8h, v16.8h, v4.h[5]\n" + "fmla v30.8h, v16.8h, v5.h[5]\n" + "ldr q16, [x10, #0xa0]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "fmla v29.8h, v17.8h, v4.h[5]\n" + "fmla v31.8h, v17.8h, v5.h[5]\n" + "ldr q17, [x10, #0xb0]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, v3.h[6]\n" + "fmla v28.8h, v18.8h, v4.h[6]\n" + "fmla v30.8h, v18.8h, v5.h[6]\n" + "ldr q18, [x10, #0xc0]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + 
"fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "fmla v29.8h, v19.8h, v4.h[6]\n" + "fmla v31.8h, v19.8h, v5.h[6]\n" + "ldr q19, [x10, #0xd0]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "fmla v28.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v29.8h, v7.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "ldr q5, [x21, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 183b\n" + "184:" // Height 6: Multiply loop: Single iteration only + "fmla v20.8h, v6.8h, v0.h[0]\n" + "fmla v22.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v24.8h, v6.8h, v2.h[0]\n" + "fmla v26.8h, v6.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v28.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0xe0]\n" + "add x22, x22, #0x10\n" + "fmla v21.8h, v7.8h, v0.h[0]\n" + "fmla v23.8h, v7.8h, v1.h[0]\n" + "add x21, x21, #0x10\n" + "sub x27, x27, #0x8\n" + "fmla v25.8h, v7.8h, v2.h[0]\n" + "fmla v27.8h, v7.8h, v3.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v29.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0xf0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v20.8h, v8.8h, v0.h[1]\n" + "fmla v22.8h, v8.8h, v1.h[1]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v24.8h, v8.8h, v2.h[1]\n" + "fmla v26.8h, v8.8h, v3.h[1]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "add x10, x10, #0x100\n" + "fmla v28.8h, v8.8h, v4.h[1]\n" + "fmla v30.8h, v8.8h, v5.h[1]\n" + "fmla v21.8h, v9.8h, 
v0.h[1]\n" + "fmla v23.8h, v9.8h, v1.h[1]\n" + "fmla v25.8h, v9.8h, v2.h[1]\n" + "fmla v27.8h, v9.8h, v3.h[1]\n" + "fmla v29.8h, v9.8h, v4.h[1]\n" + "fmla v31.8h, v9.8h, v5.h[1]\n" + "fmla v20.8h, v10.8h, v0.h[2]\n" + "fmla v22.8h, v10.8h, v1.h[2]\n" + "fmla v24.8h, v10.8h, v2.h[2]\n" + "fmla v26.8h, v10.8h, v3.h[2]\n" + "fmla v28.8h, v10.8h, v4.h[2]\n" + "fmla v30.8h, v10.8h, v5.h[2]\n" + "fmla v21.8h, v11.8h, v0.h[2]\n" + "fmla v23.8h, v11.8h, v1.h[2]\n" + "fmla v25.8h, v11.8h, v2.h[2]\n" + "fmla v27.8h, v11.8h, v3.h[2]\n" + "fmla v29.8h, v11.8h, v4.h[2]\n" + "fmla v31.8h, v11.8h, v5.h[2]\n" + "fmla v20.8h, v12.8h, v0.h[3]\n" + "fmla v22.8h, v12.8h, v1.h[3]\n" + "fmla v24.8h, v12.8h, v2.h[3]\n" + "fmla v26.8h, v12.8h, v3.h[3]\n" + "fmla v28.8h, v12.8h, v4.h[3]\n" + "fmla v30.8h, v12.8h, v5.h[3]\n" + "fmla v21.8h, v13.8h, v0.h[3]\n" + "fmla v23.8h, v13.8h, v1.h[3]\n" + "fmla v25.8h, v13.8h, v2.h[3]\n" + "fmla v27.8h, v13.8h, v3.h[3]\n" + "fmla v29.8h, v13.8h, v4.h[3]\n" + "fmla v31.8h, v13.8h, v5.h[3]\n" + "fmla v20.8h, v14.8h, v0.h[4]\n" + "fmla v22.8h, v14.8h, v1.h[4]\n" + "fmla v24.8h, v14.8h, v2.h[4]\n" + "fmla v26.8h, v14.8h, v3.h[4]\n" + "fmla v28.8h, v14.8h, v4.h[4]\n" + "fmla v30.8h, v14.8h, v5.h[4]\n" + "fmla v21.8h, v15.8h, v0.h[4]\n" + "fmla v23.8h, v15.8h, v1.h[4]\n" + "fmla v25.8h, v15.8h, v2.h[4]\n" + "fmla v27.8h, v15.8h, v3.h[4]\n" + "fmla v29.8h, v15.8h, v4.h[4]\n" + "fmla v31.8h, v15.8h, v5.h[4]\n" + "fmla v20.8h, v16.8h, v0.h[5]\n" + "fmla v22.8h, v16.8h, v1.h[5]\n" + "fmla v24.8h, v16.8h, v2.h[5]\n" + "fmla v26.8h, v16.8h, v3.h[5]\n" + "fmla v28.8h, v16.8h, v4.h[5]\n" + "fmla v30.8h, v16.8h, v5.h[5]\n" + "fmla v21.8h, v17.8h, v0.h[5]\n" + "fmla v23.8h, v17.8h, v1.h[5]\n" + "fmla v25.8h, v17.8h, v2.h[5]\n" + "fmla v27.8h, v17.8h, v3.h[5]\n" + "fmla v29.8h, v17.8h, v4.h[5]\n" + "fmla v31.8h, v17.8h, v5.h[5]\n" + "fmla v20.8h, v18.8h, v0.h[6]\n" + "fmla v22.8h, v18.8h, v1.h[6]\n" + "fmla v24.8h, v18.8h, v2.h[6]\n" + "fmla v26.8h, v18.8h, 
v3.h[6]\n" + "fmla v28.8h, v18.8h, v4.h[6]\n" + "fmla v30.8h, v18.8h, v5.h[6]\n" + "fmla v21.8h, v19.8h, v0.h[6]\n" + "fmla v23.8h, v19.8h, v1.h[6]\n" + "fmla v25.8h, v19.8h, v2.h[6]\n" + "fmla v27.8h, v19.8h, v3.h[6]\n" + "fmla v29.8h, v19.8h, v4.h[6]\n" + "fmla v31.8h, v19.8h, v5.h[6]\n" + "fmla v20.8h, v6.8h, v0.h[7]\n" + "fmla v22.8h, v6.8h, v1.h[7]\n" + "fmla v24.8h, v6.8h, v2.h[7]\n" + "fmla v26.8h, v6.8h, v3.h[7]\n" + "fmla v28.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v21.8h, v7.8h, v0.h[7]\n" + "fmla v23.8h, v7.8h, v1.h[7]\n" + "fmla v25.8h, v7.8h, v2.h[7]\n" + "fmla v27.8h, v7.8h, v3.h[7]\n" + "fmla v29.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "185:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 187f\n" + "186:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "add x10, x10, #0x20\n" + "fmla v20.8h, v8.8h, v0.h[0]\n" + "fmla v22.8h, v8.8h, v1.h[0]\n" + "fmla v24.8h, v8.8h, v2.h[0]\n" + "fmla v26.8h, v8.8h, v3.h[0]\n" + "fmla v28.8h, v8.8h, v4.h[0]\n" + "fmla v30.8h, v8.8h, v5.h[0]\n" + "fmla v21.8h, v9.8h, v0.h[0]\n" + "fmla v23.8h, v9.8h, v1.h[0]\n" + "fmla v25.8h, v9.8h, v2.h[0]\n" + "fmla v27.8h, v9.8h, v3.h[0]\n" + "fmla v29.8h, v9.8h, v4.h[0]\n" + "fmla v31.8h, v9.8h, v5.h[0]\n" + "cbnz x27, 186b\n" + "187:" // Height 6: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 180b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x20, LSL #1\n" 
+ "add x22, x23, x20, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 188f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v20.8h, v20.8h, v17.8h\n" + "fmin v21.8h, v21.8h, v17.8h\n" + "fmin v22.8h, v22.8h, v17.8h\n" + "fmin v23.8h, v23.8h, v17.8h\n" + "fmin v24.8h, v24.8h, v17.8h\n" + "fmin v25.8h, v25.8h, v17.8h\n" + "fmin v26.8h, v26.8h, v17.8h\n" + "fmin v27.8h, v27.8h, v17.8h\n" + "fmin v28.8h, v28.8h, v17.8h\n" + "fmin v29.8h, v29.8h, v17.8h\n" + "fmin v30.8h, v30.8h, v17.8h\n" + "fmin v31.8h, v31.8h, v17.8h\n" + "fmax v20.8h, v20.8h, v16.8h\n" + "fmax v21.8h, v21.8h, v16.8h\n" + "fmax v22.8h, v22.8h, v16.8h\n" + "fmax v23.8h, v23.8h, v16.8h\n" + "fmax v24.8h, v24.8h, v16.8h\n" + "fmax v25.8h, v25.8h, v16.8h\n" + "fmax v26.8h, v26.8h, v16.8h\n" + "fmax v27.8h, v27.8h, v16.8h\n" + "fmax v28.8h, v28.8h, v16.8h\n" + "fmax v29.8h, v29.8h, v16.8h\n" + "fmax v30.8h, v30.8h, v16.8h\n" + "fmax v31.8h, v31.8h, v16.8h\n" + "188:" // Height 6: No activation + "cmp x11, #0x10\n" + "bge 197f\n" + "tbz x11, #3, 192f\n" + "st1 { v20.8h }, [x9], #0x10\n" + "st1 { v22.8h }, [x26], #0x10\n" + "st1 { v24.8h }, [x25], #0x10\n" + "st1 { v26.8h }, [x24], #0x10\n" + "st1 { v28.8h }, [x23], #0x10\n" + "st1 { v30.8h }, [x22], #0x10\n" + "tbz x11, #2, 190f\n" + "str d21, [x9], #0x8\n" + "str d23, [x26], #0x8\n" + "str d25, [x25], #0x8\n" + "str d27, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "str d31, [x22], #0x8\n" + "tbz x11, #1, 189f\n" + "st1 { v21.s }[2], [x9], #0x4\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v25.s }[2], [x25], #0x4\n" + "st1 { v27.s }[2], [x24], #0x4\n" + "st1 { v29.s }[2], [x23], #0x4\n" + "st1 { v31.s }[2], [x22], #0x4\n" + "tbz x11, #0, 196f\n" + "st1 { v21.h }[6], [x9]\n" + "st1 { v23.h }[6], [x26]\n" + "st1 { v25.h }[6], [x25]\n" + "st1 { v27.h }[6], [x24]\n" + "st1 { v29.h }[6], [x23]\n" + "st1 
{ v31.h }[6], [x22]\n" + "b 196f\n" + "189:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 196f\n" + "st1 { v21.h }[4], [x9]\n" + "st1 { v23.h }[4], [x26]\n" + "st1 { v25.h }[4], [x25]\n" + "st1 { v27.h }[4], [x24]\n" + "st1 { v29.h }[4], [x23]\n" + "st1 { v31.h }[4], [x22]\n" + "b 196f\n" + "190:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 191f\n" + "str s21, [x9], #0x4\n" + "str s23, [x26], #0x4\n" + "str s25, [x25], #0x4\n" + "str s27, [x24], #0x4\n" + "str s29, [x23], #0x4\n" + "str s31, [x22], #0x4\n" + "tbz x11, #0, 196f\n" + "st1 { v21.h }[2], [x9]\n" + "st1 { v23.h }[2], [x26]\n" + "st1 { v25.h }[2], [x25]\n" + "st1 { v27.h }[2], [x24]\n" + "st1 { v29.h }[2], [x23]\n" + "st1 { v31.h }[2], [x22]\n" + "b 196f\n" + "191:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 196f\n" + "str h21, [x9, #0x0]\n" + "str h23, [x26, #0x0]\n" + "str h25, [x25, #0x0]\n" + "str h27, [x24, #0x0]\n" + "str h29, [x23, #0x0]\n" + "str h31, [x22, #0x0]\n" + "b 196f\n" + "192:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 194f\n" + "str d20, [x9], #0x8\n" + "str d22, [x26], #0x8\n" + "str d24, [x25], #0x8\n" + "str d26, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "str d30, [x22], #0x8\n" + "tbz x11, #1, 193f\n" + "st1 { v20.s }[2], [x9], #0x4\n" + "st1 { v22.s }[2], [x26], #0x4\n" + "st1 { v24.s }[2], [x25], #0x4\n" + "st1 { v26.s }[2], [x24], #0x4\n" + "st1 { v28.s }[2], [x23], #0x4\n" + "st1 { v30.s }[2], [x22], #0x4\n" + "tbz x11, #0, 196f\n" + "st1 { v20.h }[6], [x9]\n" + "st1 { v22.h }[6], [x26]\n" + "st1 { v24.h }[6], [x25]\n" + "st1 { v26.h }[6], [x24]\n" + "st1 { v28.h }[6], [x23]\n" + "st1 { v30.h }[6], [x22]\n" + "b 196f\n" + "193:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 196f\n" + "st1 { v20.h }[4], [x9]\n" + "st1 { v22.h }[4], [x26]\n" + "st1 { v24.h }[4], [x25]\n" + "st1 { v26.h }[4], [x24]\n" + "st1 { v28.h }[4], [x23]\n" + "st1 { v30.h }[4], [x22]\n" + "b 
196f\n" + "194:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 195f\n" + "str s20, [x9], #0x4\n" + "str s22, [x26], #0x4\n" + "str s24, [x25], #0x4\n" + "str s26, [x24], #0x4\n" + "str s28, [x23], #0x4\n" + "str s30, [x22], #0x4\n" + "tbz x11, #0, 196f\n" + "st1 { v20.h }[2], [x9]\n" + "st1 { v22.h }[2], [x26]\n" + "st1 { v24.h }[2], [x25]\n" + "st1 { v26.h }[2], [x24]\n" + "st1 { v28.h }[2], [x23]\n" + "st1 { v30.h }[2], [x22]\n" + "b 196f\n" + "195:" // Height 6: Partial direct writeback: partial_1_0 + "str h20, [x9, #0x0]\n" + "str h22, [x26, #0x0]\n" + "str h24, [x25, #0x0]\n" + "str h26, [x24, #0x0]\n" + "str h28, [x23, #0x0]\n" + "str h30, [x22, #0x0]\n" + "196:" // Height 6: Partial direct writeback: Done + "b 198f\n" + "197:" // Height 6: Full writeback + "str q20, [x9, #0x0]\n" + "str q21, [x9, #0x10]\n" + "add x9, x9, #0x20\n" + "str q22, [x26, #0x0]\n" + "str q23, [x26, #0x10]\n" + "str q24, [x25, #0x0]\n" + "str q25, [x25, #0x10]\n" + "str q26, [x24, #0x0]\n" + "str q27, [x24, #0x10]\n" + "str q28, [x23, #0x0]\n" + "str q29, [x23, #0x10]\n" + "str q30, [x22, #0x0]\n" + "str q31, [x22, #0x10]\n" + "198:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 167b\n" + "subs %x[m], %x[m], #0x6\n" + "beq 200f\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 199f\n" + "add x21, x21, #0x6\n" + "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "199:" // Update direct input + "mov x20, #0xc\n" + "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" + "b 1b\n" + "200:" // Exit + : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) + : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), + [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), + [offsetof_N] "I"(offsetof(KernelArgs, N)), + [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), + [offsetof_input_offset] "I"(offsetof(KernelArgs, 
input_offset)), + [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), + [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), + [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), + [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} diff --git a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h b/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h new file mode 100644 index 00000000..01ad4d5c --- /dev/null +++ b/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h @@ -0,0 +1,110 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Gets `m_step` value. +/// +/// The starting row index must be divisible by `m_step`. +/// +/// @param[in] m Total number of rows. +/// +/// @return `m_step` value. +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m); + +/// Gets `n_step` value. +/// +/// The starting column index must be divisible by `n_step`. +/// +/// @param[in] n Total number of columns. +/// +/// @return `n_step` value. +size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n); + +/// Gets the default row stride in bytes of the LHS matrix. +/// +/// @param[in] k Number of columns. +/// +/// @return The default row stride in bytes of the LHS matrix. 
+size_t kai_get_lhs_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k); + +/// Gets the offset in bytes to the data element in the LHS matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] k_idx Column index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t k_idx, size_t stride); + +/// Gets the offset in bytes to the data element in the packed RHS matrix buffer. +/// +/// @param[in] k Number of columns. +/// @param[in] n_idx Row index. +/// @param[in] k_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k, size_t n_idx, size_t k_idx); + +/// Gets the default row stride in bytes of the destination matrix. +/// +/// @param[in] n Number of columns. +/// +/// @return The default row stride in bytes of the destination matrix. +size_t kai_get_dst_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n); + +/// Gets the offset in bytes to the data element in the destination matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] n_idx Column index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t n_idx, size_t stride); + +/// Gets the size in bytes of the destination matrix buffer. +/// +/// @param[in] m Number of rows. +/// @param[in] n Number of columns. +/// @param[in] stride Row stride in bytes. +/// +/// @return The size in bytes of the destination matrix buffer. +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m, size_t n, size_t stride); + +/// Runs the matrix multiplication microkernel followed by a clamp operation. 
+/// +/// The pointer to each buffer (LHS, packed RHS and output) must be advanced by the offset +/// calculated using the following functions: +/// +/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. +/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. +/// * Output: @ref kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. +/// +/// @param[in] m Number of output rows to be computed. +/// @param[in] n Number of output columns to be computed. +/// @param[in] k Common dimension of the LHS and RHS operands. +/// @param[in] lhs LHS matrix buffer. +/// @param[in] packed_rhs Packed RHS buffer. +/// @param[out] dst Output matrix buffer. +/// @param[in] lhs_stride Row stride in bytes of the LHS matrix. +/// @param[in] dst_stride Row stride in bytes of the output matrix. +/// @param[in] clamp_min Minimum value to clamp the final result. +/// @param[in] clamp_max Maximum value to clamp the final result. +void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( + size_t m, size_t n, size_t k, // + const void* lhs, const void* packed_rhs, void* dst, // + size_t lhs_stride, size_t dst_stride, // + __fp16 clamp_min, __fp16 clamp_max); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c new file mode 100644 index 00000000..ea63cffa --- /dev/null +++ b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c @@ -0,0 +1,3704 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +typedef bfloat16_t bfloat16; + +#include "kai_common.h" + +static const size_t block_height = 6; +static const size_t block_width 
= 16; + +size_t kai_get_m_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m) { + KAI_UNUSED(m); + + return 6; /* Same value as block_height. */ +} + +size_t kai_get_n_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n) { + KAI_UNUSED(n); + + return 16; /* Same value as block_width. */ +} + +size_t kai_get_lhs_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t k) { + return k * sizeof(bfloat16); /* One bfloat16 element per LHS column. */ +} + +size_t kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t m_idx, size_t k_idx, size_t stride) { + KAI_ASSUME(m_idx % block_height == 0); + KAI_ASSUME(k_idx == 0); /* k_idx is assumed to be 0, so it does not contribute to the offset. */ + + return m_idx * stride; +} + +size_t kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t k, size_t n_idx, size_t k_idx) { + KAI_ASSUME(n_idx % block_width == 0); + KAI_ASSUME(k_idx == 0); + + /* Each group of block_width columns spans block_width floats plus block_width * k bfloat16 values. */ return n_idx / block_width * (block_width * sizeof(float) + block_width * k * sizeof(bfloat16)); +} + +size_t kai_get_dst_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n) { + return n * sizeof(float); /* One float element per destination column. */ +} + +size_t kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t m_idx, size_t n_idx, size_t stride) { + KAI_ASSUME(m_idx % block_height == 0); + KAI_ASSUME(n_idx % block_width == 0); + + return m_idx * stride + n_idx * sizeof(float); +} + +size_t kai_get_dst_size_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m, size_t n, size_t stride) { + return m * stride + n * sizeof(float); /* NOTE(review): with the layout used by kai_get_dst_offset, the last element of row m - 1 ends at (m - 1) * stride + n * sizeof(float); this returns one extra stride - confirm whether the over-allocation is intended. */ +} + +void kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t m, size_t n, size_t k, // + const void* lhs, const void* packed_rhs, void* dst, // + size_t lhs_stride, size_t dst_stride, // + float clamp_min, float clamp_max) { + typedef struct { + float maxval; + float minval; + unsigned int num_strings; + const unsigned int* string_lengths; + size_t N; + const void* B_ptr; + size_t output_offset; + size_t input_initial_col; + size_t input_offset; + void* output_ptr; + const void* bias; + } KernelArgs; + + KernelArgs 
ka; + + unsigned long flags = 0; + + unsigned int string_length = k; + ka.num_strings = 1; + ka.string_lengths = &string_length; + ka.N = n; + ka.B_ptr = packed_rhs; + ka.bias = NULL; + + // Direct input. + const void* input_ptr = lhs; + ka.input_offset = lhs_stride / sizeof(bfloat16); + ka.input_initial_col = 0; + + // Direct output. + ka.output_ptr = dst; + ka.output_offset = dst_stride / sizeof(float); + + // Clamping output. + flags |= 0x2; + ka.maxval = clamp_max; + ka.minval = clamp_min; + + __asm__ __volatile__( + "1:" // Row loop + "cmp %x[m], #0x6\n" + "bge 186f\n" + "cmp %x[m], #0x4\n" + "bgt 149f\n" + "beq 112f\n" + "cmp %x[m], #0x2\n" + "bgt 75f\n" + "beq 38f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "2:" // Height 1: Column loop + "cbz x10, 3f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 15f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 14f\n" + "cmp x11, #0x10\n" + "bge 12f\n" + "tbz x11, #3, 7f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "tbz x11, #2, 5f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "tbz x11, #1, 4f\n" + "ldr d16, [x9], #0x8\n" + "mov x20, #0x38\n" + "tbz x11, #0, 11f\n" + "ld1 { v16.s }[2], [x9]\n" + "b 11f\n" + "4:" // Height 1: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 11f\n" + "ldr s16, [x9, #0x0]\n" + "b 11f\n" + "5:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 6f\n" + "ldr d11, [x9], #0x8\n" + "mov x20, #0x28\n" + "tbz x11, #0, 11f\n" + "ld1 { v11.s }[2], [x9]\n" + "b 11f\n" + "6:" // Height 1: 
Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 11f\n" + "ldr s11, [x9, #0x0]\n" + "b 11f\n" + "7:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 9f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "tbz x11, #1, 8f\n" + "ldr d10, [x9], #0x8\n" + "mov x20, #0x18\n" + "tbz x11, #0, 11f\n" + "ld1 { v10.s }[2], [x9]\n" + "b 11f\n" + "8:" // Height 1: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 11f\n" + "ldr s10, [x9, #0x0]\n" + "b 11f\n" + "9:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 10f\n" + "ldr d9, [x9], #0x8\n" + "mov x20, #0x8\n" + "tbz x11, #0, 11f\n" + "ld1 { v9.s }[2], [x9]\n" + "b 11f\n" + "10:" // Height 1: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "mov x20, #0x0\n" + "11:" // Height 1: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 13f\n" + "12:" // Height 1: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + "13:" // Height 1: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 15f\n" + "14:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "15:" // Height 1: setup done + "mov x28, #0x0\n" + "16:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 17f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 18f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, 
x26, x20, LSL #1\n" + "b 18f\n" + "17:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "18:" // Height 1: input setup done + "cmp x27, #0x8\n" + "blt 21f\n" + "ldr q1, [x26, #0x0]\n" + "ldr q7, [x10, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q6, [x10, #0x10]\n" + "blt 20f\n" + "19:" // Height 1: Multiply loop: Main loop head + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "bge 19b\n" + "20:" // Height 1: Multiply loop: Single iteration only + "add x26, x26, #0x10\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, 
#0x80]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "21:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 26f\n" + "cmp x27, #0x4\n" + "blt 23f\n" + "22:" // Height 1: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "sub x27, x27, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "cmp x27, #0x4\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla 
v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "bge 22b\n" + "23:" // Height 1: Multiply loop: Skip odd blocks + "cbz x27, 26f\n" + "tbz x27, #1, 24f\n" + "ldr s1, [x26], #0x4\n" + "tbz x27, #0, 25f\n" + "ld1 { v1.h }[2], [x26]\n" + "b 25f\n" + "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "25:" // Height 1: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "26:" // Height 1: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 16b\n" + "uzp1 v8.2d, v8.2d, v12.2d\n" + "uzp1 v9.2d, v9.2d, v13.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v10.2d, v10.2d, v14.2d\n" + "uzp1 v11.2d, v11.2d, v15.2d\n" + "tbz %x[flags], #1, 27f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v18.4s }, [x21]\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax 
v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" + "27:" // Height 1: No activation + "cmp x11, #0x10\n" + "bge 36f\n" + "tbz x11, #3, 31f\n" + "st1 { v8.4s }, [x9], #0x10\n" + "st1 { v9.4s }, [x9], #0x10\n" + "tbz x11, #2, 29f\n" + "st1 { v10.4s }, [x9], #0x10\n" + "tbz x11, #1, 28f\n" + "str d11, [x9], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v11.s }[2], [x9]\n" + "b 35f\n" + "28:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 35f\n" + "str s11, [x9, #0x0]\n" + "b 35f\n" + "29:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, #1, 30f\n" + "str d10, [x9], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v10.s }[2], [x9]\n" + "b 35f\n" + "30:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 35f\n" + "str s10, [x9, #0x0]\n" + "b 35f\n" + "31:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 33f\n" + "st1 { v8.4s }, [x9], #0x10\n" + "tbz x11, #1, 32f\n" + "str d9, [x9], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v9.s }[2], [x9]\n" + "b 35f\n" + "32:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 35f\n" + "str s9, [x9, #0x0]\n" + "b 35f\n" + "33:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 34f\n" + "str d8, [x9], #0x8\n" + "tbz x11, #0, 35f\n" + "st1 { v8.s }[2], [x9]\n" + "b 35f\n" + "34:" // Height 1: Partial direct writeback: partial_1_0 + "str s8, [x9, #0x0]\n" + "35:" // Height 1: Partial direct writeback: Done + "b 37f\n" + "36:" // Height 1: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "37:" // Height 1: Writeback done + "subs x11, x11, #0x10\n" + "bgt 2b\n" + "b 224f\n" + "38:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "39:" // Height 2: Column loop + "cbz x10, 40f\n" + "ldr q8, [x10, #0x0]\n" + 
"ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "b 52f\n" + "40:" // Height 2: no bias + "tbz %x[flags], #0, 51f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #2\n" + "bge 49f\n" + "tbz x11, #3, 44f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "tbz x11, #2, 42f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "ld1 { v14.4s }, [x26], #0x10\n" + "tbz x11, #1, 41f\n" + "ldr d16, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "mov x20, #0x38\n" + "tbz x11, #0, 48f\n" + "ld1 { v16.s }[2], [x9]\n" + "ld1 { v15.s }[2], [x26]\n" + "b 48f\n" + "41:" // Height 2: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 48f\n" + "ldr s16, [x9, #0x0]\n" + "ldr s15, [x26, #0x0]\n" + "b 48f\n" + "42:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 43f\n" + "ldr d11, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "mov x20, #0x28\n" + "tbz x11, #0, 48f\n" + "ld1 { v11.s }[2], [x9]\n" + "ld1 { v14.s }[2], [x26]\n" + "b 48f\n" + "43:" // Height 2: Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 48f\n" + "ldr s11, [x9, #0x0]\n" + "ldr s14, [x26, #0x0]\n" + "b 48f\n" + "44:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 46f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "tbz x11, #1, 45f\n" + "ldr d10, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "mov x20, #0x18\n" + "tbz x11, #0, 48f\n" + "ld1 { v10.s }[2], [x9]\n" + "ld1 { v13.s }[2], [x26]\n" + "b 48f\n" + "45:" // Height 2: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 
48f\n" + "ldr s10, [x9, #0x0]\n" + "ldr s13, [x26, #0x0]\n" + "b 48f\n" + "46:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 47f\n" + "ldr d9, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "mov x20, #0x8\n" + "tbz x11, #0, 48f\n" + "ld1 { v9.s }[2], [x9]\n" + "ld1 { v12.s }[2], [x26]\n" + "b 48f\n" + "47:" // Height 2: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "ldr s12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "48:" // Height 2: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 50f\n" + "49:" // Height 2: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "50:" // Height 2: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "b 52f\n" + "51:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "52:" // Height 2: setup done + "mov x28, #0x0\n" + "53:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 54f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "cbnz x28, 55f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "b 55f\n" + "54:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "55:" // Height 2: input 
setup done + "cmp x27, #0x8\n" + "blt 58f\n" + "ldr q1, [x26, #0x0]\n" + "ldr q2, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "blt 57f\n" + "56:" // Height 2: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "ldr q2, [x25, #0x0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "bge 56b\n" + "57:" // Height 2: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x26, x26, #0x10\n" + 
"add x25, x25, #0x10\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "58:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 63f\n" + "cmp x27, #0x4\n" + "blt 60f\n" + "59:" // Height 2: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" + "sub x27, x27, #0x4\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "cmp x27, #0x4\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, 
v0.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + "bge 59b\n" + "60:" // Height 2: Multiply loop: Skip odd blocks + "cbz x27, 63f\n" + "tbz x27, #1, 61f\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "tbz x27, #0, 62f\n" + "ld1 { v1.h }[2], [x26]\n" + "ld1 { v2.h }[2], [x25]\n" + "b 62f\n" + "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "ldr h2, [x25, #0x0]\n" + "62:" // Height 2: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + "63:" // Height 2: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 53b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "add x26, x9, x20, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + 
"uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 64f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v18.4s }, [x21]\n" + "ld1r { v17.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v18.4s\n" + "fmin v12.4s, v12.4s, v18.4s\n" + "fmin v13.4s, v13.4s, v18.4s\n" + "fmin v14.4s, v14.4s, v18.4s\n" + "fmin v8.4s, v8.4s, v18.4s\n" + "fmin v9.4s, v9.4s, v18.4s\n" + "fmin v10.4s, v10.4s, v18.4s\n" + "fmin v11.4s, v11.4s, v18.4s\n" + "fmax v7.4s, v7.4s, v17.4s\n" + "fmax v12.4s, v12.4s, v17.4s\n" + "fmax v13.4s, v13.4s, v17.4s\n" + "fmax v14.4s, v14.4s, v17.4s\n" + "fmax v8.4s, v8.4s, v17.4s\n" + "fmax v9.4s, v9.4s, v17.4s\n" + "fmax v10.4s, v10.4s, v17.4s\n" + "fmax v11.4s, v11.4s, v17.4s\n" + "64:" // Height 2: No activation + "cmp x11, #0x10\n" + "bge 73f\n" + "tbz x11, #3, 68f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "tbz x11, #2, 66f\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "tbz x11, #1, 65f\n" + "str d14, [x9], #0x8\n" + "str d11, [x26], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v11.s }[2], [x26]\n" + "b 72f\n" + "65:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 72f\n" + "str s14, [x9, #0x0]\n" + "str s11, [x26, #0x0]\n" + "b 72f\n" + "66:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 67f\n" + "str d13, [x9], #0x8\n" + "str d10, [x26], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v10.s }[2], [x26]\n" + "b 72f\n" + "67:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 72f\n" + "str s13, [x9, #0x0]\n" + "str s10, [x26, #0x0]\n" + "b 72f\n" + "68:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 70f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "tbz x11, #1, 69f\n" + "str d12, [x9], #0x8\n" + "str d9, 
[x26], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v9.s }[2], [x26]\n" + "b 72f\n" + "69:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 72f\n" + "str s12, [x9, #0x0]\n" + "str s9, [x26, #0x0]\n" + "b 72f\n" + "70:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 71f\n" + "str d7, [x9], #0x8\n" + "str d8, [x26], #0x8\n" + "tbz x11, #0, 72f\n" + "st1 { v7.s }[2], [x9]\n" + "st1 { v8.s }[2], [x26]\n" + "b 72f\n" + "71:" // Height 2: Partial direct writeback: partial_1_0 + "str s7, [x9, #0x0]\n" + "str s8, [x26, #0x0]\n" + "72:" // Height 2: Partial direct writeback: Done + "b 74f\n" + "73:" // Height 2: Full writeback + "str q7, [x9, #0x0]\n" + "str q12, [x9, #0x10]\n" + "str q13, [x9, #0x20]\n" + "str q14, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "74:" // Height 2: Writeback done + "subs x11, x11, #0x10\n" + "bgt 39b\n" + "b 224f\n" + "75:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "76:" // Height 3: Column loop + "cbz x10, 77f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v16.16b, v8.16b\n" + "mov v20.16b, v12.16b\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 89f\n" + "77:" // Height 3: no bias + "tbz %x[flags], #0, 88f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + 
"add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "bge 86f\n" + "tbz x11, #3, 81f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x25], #0x10\n" + "tbz x11, #2, 79f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "ld1 { v14.4s }, [x26], #0x10\n" + "ld1 { v19.4s }, [x25], #0x10\n" + "tbz x11, #1, 78f\n" + "ldr d16, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "mov x20, #0x38\n" + "ldr d24, [x25], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v16.s }[2], [x9]\n" + "ld1 { v15.s }[2], [x26]\n" + "ld1 { v24.s }[2], [x25]\n" + "b 85f\n" + "78:" // Height 3: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 85f\n" + "ldr s16, [x9, #0x0]\n" + "ldr s15, [x26, #0x0]\n" + "ldr s24, [x25, #0x0]\n" + "b 85f\n" + "79:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 80f\n" + "ldr d11, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "mov x20, #0x28\n" + "ldr d19, [x25], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v11.s }[2], [x9]\n" + "ld1 { v14.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x25]\n" + "b 85f\n" + "80:" // Height 3: Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 85f\n" + "ldr s11, [x9, #0x0]\n" + "ldr s14, [x26, #0x0]\n" + "ldr s19, [x25, #0x0]\n" + "b 85f\n" + "81:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 83f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "tbz x11, #1, 82f\n" + "ldr d10, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "mov x20, #0x18\n" + "ldr d18, [x25], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v10.s }[2], [x9]\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x25]\n" + "b 85f\n" + "82:" // Height 3: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 85f\n" + "ldr s10, [x9, #0x0]\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x25, #0x0]\n" + "b 85f\n" + "83:" // Height 3: 
Partial accumulate: partial_2_0 + "tbz x11, #1, 84f\n" + "ldr d9, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "mov x20, #0x8\n" + "ldr d17, [x25], #0x8\n" + "tbz x11, #0, 85f\n" + "ld1 { v9.s }[2], [x9]\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x25]\n" + "b 85f\n" + "84:" // Height 3: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "ldr s12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr s17, [x25, #0x0]\n" + "85:" // Height 3: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 87f\n" + "86:" // Height 3: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q17, [x25, #0x0]\n" + "ldr q18, [x25, #0x10]\n" + "ldr q19, [x25, #0x20]\n" + "ldr q24, [x25, #0x30]\n" + "87:" // Height 3: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 89f\n" + "88:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "89:" // Height 3: setup done + "mov x28, #0x0\n" + "90:" // Height 3: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 91f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x28, 92f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "b 92f\n" + "91:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "92:" // Height 3: input setup done + "cmp x27, #0x8\n" + "blt 95f\n" + "ldr q1, [x26, #0x0]\n" + "ldr q2, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "blt 94f\n" + "93:" // Height 3: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla 
v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + "ldr q2, [x25, #0x0]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "bge 93b\n" + "94:" // Height 3: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 
0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 
0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "95:" // Height 3: Multiply loop: Main loop skip + "cbz x27, 100f\n" + "cmp x27, #0x4\n" + "blt 97f\n" + "96:" // Height 3: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" + "sub x27, x27, #0x4\n" + "ldr d3, [x24], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "cmp x27, #0x4\n" + "ldr q7, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 96b\n" + "97:" // Height 3: Multiply loop: Skip odd blocks + "cbz x27, 100f\n" + "tbz x27, #1, 98f\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "tbz x27, #0, 99f\n" + "ld1 { v1.h }[2], [x26]\n" + "ld1 { v2.h }[2], [x25]\n" + "ld1 { v3.h }[2], [x24]\n" + "b 99f\n" + "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "ldr h2, [x25, #0x0]\n" + "ldr h3, [x24, #0x0]\n" 
+ "99:" // Height 3: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "100:" // Height 3: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 90b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "prfm pstl1keep, [x25, #0x0]\n" + "uzp1 v16.2d, v16.2d, v20.2d\n" + "uzp1 v17.2d, v17.2d, v21.2d\n" + "uzp1 v18.2d, v18.2d, v22.2d\n" + "uzp1 v19.2d, 
v19.2d, v23.2d\n" + "tbz %x[flags], #1, 101f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v26.4s }, [x21]\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" + "101:" // Height 3: No activation + "cmp x11, #0x10\n" + "bge 110f\n" + "tbz x11, #3, 105f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x25], #0x10\n" + "st1 { v17.4s }, [x25], #0x10\n" + "tbz x11, #2, 103f\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v18.4s }, [x25], #0x10\n" + "tbz x11, #1, 102f\n" + "str d14, [x9], #0x8\n" + "str d11, [x26], #0x8\n" + "str d19, [x25], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v19.s }[2], [x25]\n" + "b 109f\n" + "102:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 109f\n" + "str s14, [x9, #0x0]\n" + "str s11, [x26, #0x0]\n" + "str s19, [x25, #0x0]\n" + "b 109f\n" + "103:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 104f\n" + "str d13, [x9], #0x8\n" + "str d10, [x26], #0x8\n" + "str d18, [x25], #0x8\n" + 
"tbz x11, #0, 109f\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v18.s }[2], [x25]\n" + "b 109f\n" + "104:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 109f\n" + "str s13, [x9, #0x0]\n" + "str s10, [x26, #0x0]\n" + "str s18, [x25, #0x0]\n" + "b 109f\n" + "105:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 107f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v16.4s }, [x25], #0x10\n" + "tbz x11, #1, 106f\n" + "str d12, [x9], #0x8\n" + "str d9, [x26], #0x8\n" + "str d17, [x25], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v17.s }[2], [x25]\n" + "b 109f\n" + "106:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 109f\n" + "str s12, [x9, #0x0]\n" + "str s9, [x26, #0x0]\n" + "str s17, [x25, #0x0]\n" + "b 109f\n" + "107:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 108f\n" + "str d7, [x9], #0x8\n" + "str d8, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "tbz x11, #0, 109f\n" + "st1 { v7.s }[2], [x9]\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v16.s }[2], [x25]\n" + "b 109f\n" + "108:" // Height 3: Partial direct writeback: partial_1_0 + "str s7, [x9, #0x0]\n" + "str s8, [x26, #0x0]\n" + "str s16, [x25, #0x0]\n" + "109:" // Height 3: Partial direct writeback: Done + "b 111f\n" + "110:" // Height 3: Full writeback + "str q7, [x9, #0x0]\n" + "str q12, [x9, #0x10]\n" + "str q13, [x9, #0x20]\n" + "str q14, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q16, [x25, #0x0]\n" + "str q17, [x25, #0x10]\n" + "str q18, [x25, #0x20]\n" + "str q19, [x25, #0x30]\n" + "111:" // Height 3: Writeback done + "subs x11, x11, #0x10\n" + "bgt 76b\n" + "b 224f\n" + "112:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], 
%[offsetof_output_ptr]]\n" + "113:" // Height 4: Column loop + "cbz x10, 114f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v16.16b, v8.16b\n" + "mov v20.16b, v12.16b\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "b 126f\n" + "114:" // Height 4: no bias + "tbz %x[flags], #0, 125f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "bge 123f\n" + "tbz x11, #3, 118f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x25], #0x10\n" + "ld1 { v21.4s }, [x24], #0x10\n" + "tbz x11, #2, 116f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "ld1 { v14.4s }, [x26], #0x10\n" + "ld1 { v19.4s }, [x25], #0x10\n" + "ld1 { v22.4s }, [x24], #0x10\n" + "tbz x11, #1, 115f\n" + "ldr d16, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "mov x20, #0x38\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v16.s }[2], [x9]\n" + "ld1 { v15.s }[2], [x26]\n" + "ld1 { v24.s }[2], [x25]\n" + "ld1 { v23.s }[2], [x24]\n" + "b 122f\n" + "115:" // Height 4: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 122f\n" + "ldr s16, [x9, #0x0]\n" + "ldr s15, [x26, #0x0]\n" + "ldr s24, [x25, #0x0]\n" + "ldr s23, [x24, #0x0]\n" + "b 122f\n" + "116:" // Height 4: Partial accumulate: partial_2_8 
+ "tbz x11, #1, 117f\n" + "ldr d11, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "mov x20, #0x28\n" + "ldr d19, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v11.s }[2], [x9]\n" + "ld1 { v14.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x25]\n" + "ld1 { v22.s }[2], [x24]\n" + "b 122f\n" + "117:" // Height 4: Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 122f\n" + "ldr s11, [x9, #0x0]\n" + "ldr s14, [x26, #0x0]\n" + "ldr s19, [x25, #0x0]\n" + "ldr s22, [x24, #0x0]\n" + "b 122f\n" + "118:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 120f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "tbz x11, #1, 119f\n" + "ldr d10, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "mov x20, #0x18\n" + "ldr d18, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v10.s }[2], [x9]\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "b 122f\n" + "119:" // Height 4: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 122f\n" + "ldr s10, [x9, #0x0]\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "b 122f\n" + "120:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 121f\n" + "ldr d9, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "mov x20, #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "tbz x11, #0, 122f\n" + "ld1 { v9.s }[2], [x9]\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x25]\n" + "ld1 { v20.s }[2], [x24]\n" + "b 122f\n" + "121:" // Height 4: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "ldr s12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr s17, [x25, #0x0]\n" + "ldr s20, [x24, #0x0]\n" + "122:" // Height 4: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 124f\n" + "123:" // Height 4: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, 
#0x20]\n" + "ldr q16, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q17, [x25, #0x0]\n" + "ldr q18, [x25, #0x10]\n" + "ldr q19, [x25, #0x20]\n" + "ldr q24, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, [x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "124:" // Height 4: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "b 126f\n" + "125:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "126:" // Height 4: setup done + "mov x28, #0x0\n" + "127:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 128f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x28, 129f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add 
x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "b 129f\n" + "128:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "129:" // Height 4: input setup done + "cmp x27, #0x8\n" + "blt 132f\n" + "ldr q1, [x26, #0x0]\n" + "ldr q2, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "blt 131f\n" + "130:" // Height 4: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "ldr q4, [x23, #0x0]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + 
".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + "ldr q2, [x25, #0x0]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "ldr q3, [x24, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + "bge 130b\n" + "131:" // Height 4: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "sub x27, x27, #0x8\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + 
".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "132:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 137f\n" + "cmp x27, #0x4\n" + "blt 134f\n" + "133:" // Height 4: Multiply 
loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" + "sub x27, x27, #0x4\n" + "ldr d3, [x24], #0x8\n" + "ldr d4, [x23], #0x8\n" + "cmp x27, #0x4\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + "bge 133b\n" + "134:" // Height 4: Multiply loop: Skip odd blocks + "cbz x27, 137f\n" + "tbz x27, #1, 135f\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "tbz x27, #0, 136f\n" + "ld1 { v1.h }[2], [x26]\n" + "ld1 { v2.h }[2], [x25]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x23]\n" + "b 136f\n" + "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "ldr h2, [x25, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x23, #0x0]\n" + "136:" // Height 4: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "ldr q6, [x10, #0x10]\n" + 
"trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "137:" // Height 4: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 127b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "uzp1 v21.2d, v18.2d, 
v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "tbz %x[flags], #1, 138f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v26.4s }, [x21]\n" + "ld1r { v25.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v26.4s\n" + "fmin v12.4s, v12.4s, v26.4s\n" + "fmin v13.4s, v13.4s, v26.4s\n" + "fmin v14.4s, v14.4s, v26.4s\n" + "fmin v8.4s, v8.4s, v26.4s\n" + "fmin v9.4s, v9.4s, v26.4s\n" + "fmin v10.4s, v10.4s, v26.4s\n" + "fmin v11.4s, v11.4s, v26.4s\n" + "fmin v15.4s, v15.4s, v26.4s\n" + "fmin v20.4s, v20.4s, v26.4s\n" + "fmin v21.4s, v21.4s, v26.4s\n" + "fmin v22.4s, v22.4s, v26.4s\n" + "fmin v16.4s, v16.4s, v26.4s\n" + "fmin v17.4s, v17.4s, v26.4s\n" + "fmin v18.4s, v18.4s, v26.4s\n" + "fmin v19.4s, v19.4s, v26.4s\n" + "fmax v7.4s, v7.4s, v25.4s\n" + "fmax v12.4s, v12.4s, v25.4s\n" + "fmax v13.4s, v13.4s, v25.4s\n" + "fmax v14.4s, v14.4s, v25.4s\n" + "fmax v8.4s, v8.4s, v25.4s\n" + "fmax v9.4s, v9.4s, v25.4s\n" + "fmax v10.4s, v10.4s, v25.4s\n" + "fmax v11.4s, v11.4s, v25.4s\n" + "fmax v15.4s, v15.4s, v25.4s\n" + "fmax v20.4s, v20.4s, v25.4s\n" + "fmax v21.4s, v21.4s, v25.4s\n" + "fmax v22.4s, v22.4s, v25.4s\n" + "fmax v16.4s, v16.4s, v25.4s\n" + "fmax v17.4s, v17.4s, v25.4s\n" + "fmax v18.4s, v18.4s, v25.4s\n" + "fmax v19.4s, v19.4s, v25.4s\n" + "138:" // Height 4: No activation + "cmp x11, #0x10\n" + "bge 147f\n" + "tbz x11, #3, 142f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "tbz x11, #2, 140f\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "tbz x11, #1, 139f\n" + "str d14, [x9], #0x8\n" + "str d11, [x26], #0x8\n" + "str d22, [x25], #0x8\n" 
+ "str d19, [x24], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "b 146f\n" + "139:" // Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 146f\n" + "str s14, [x9, #0x0]\n" + "str s11, [x26, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "b 146f\n" + "140:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 141f\n" + "str d13, [x9], #0x8\n" + "str d10, [x26], #0x8\n" + "str d21, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "b 146f\n" + "141:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 146f\n" + "str s13, [x9, #0x0]\n" + "str s10, [x26, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "b 146f\n" + "142:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 144f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "tbz x11, #1, 143f\n" + "str d12, [x9], #0x8\n" + "str d9, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "b 146f\n" + "143:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 146f\n" + "str s12, [x9, #0x0]\n" + "str s9, [x26, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "b 146f\n" + "144:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 145f\n" + "str d7, [x9], #0x8\n" + "str d8, [x26], #0x8\n" + "str d15, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "tbz x11, #0, 146f\n" + "st1 { v7.s }[2], [x9]\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "b 146f\n" + "145:" 
// Height 4: Partial direct writeback: partial_1_0 + "str s7, [x9, #0x0]\n" + "str s8, [x26, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "146:" // Height 4: Partial direct writeback: Done + "b 148f\n" + "147:" // Height 4: Full writeback + "str q7, [x9, #0x0]\n" + "str q12, [x9, #0x10]\n" + "str q13, [x9, #0x20]\n" + "str q14, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q15, [x25, #0x0]\n" + "str q20, [x25, #0x10]\n" + "str q21, [x25, #0x20]\n" + "str q22, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "148:" // Height 4: Writeback done + "subs x11, x11, #0x10\n" + "bgt 113b\n" + "b 224f\n" + "149:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "150:" // Height 5: Column loop + "cbz x10, 151f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + "zip1 v11.2d, v11.2d, v11.2d\n" + "mov v16.16b, v8.16b\n" + "mov v20.16b, v12.16b\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v24.16b, v8.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v15.16b\n" + "b 163f\n" + "151:" // Height 5: no bias + "tbz %x[flags], #0, 162f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp 
x11, #0x10\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "add x23, x24, x20, LSL #2\n" + "bge 160f\n" + "tbz x11, #3, 155f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x25], #0x10\n" + "ld1 { v21.4s }, [x24], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "tbz x11, #2, 153f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "ld1 { v14.4s }, [x26], #0x10\n" + "ld1 { v19.4s }, [x25], #0x10\n" + "ld1 { v22.4s }, [x24], #0x10\n" + "ld1 { v27.4s }, [x23], #0x10\n" + "tbz x11, #1, 152f\n" + "ldr d16, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "mov x20, #0x38\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d6, [x23], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v16.s }[2], [x9]\n" + "ld1 { v15.s }[2], [x26]\n" + "ld1 { v24.s }[2], [x25]\n" + "ld1 { v23.s }[2], [x24]\n" + "ld1 { v6.s }[2], [x23]\n" + "b 159f\n" + "152:" // Height 5: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 159f\n" + "ldr s16, [x9, #0x0]\n" + "ldr s15, [x26, #0x0]\n" + "ldr s24, [x25, #0x0]\n" + "ldr s23, [x24, #0x0]\n" + "ldr s6, [x23, #0x0]\n" + "b 159f\n" + "153:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 154f\n" + "ldr d11, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "mov x20, #0x28\n" + "ldr d19, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v11.s }[2], [x9]\n" + "ld1 { v14.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x25]\n" + "ld1 { v22.s }[2], [x24]\n" + "ld1 { v27.s }[2], [x23]\n" + "b 159f\n" + "154:" // Height 5: Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 159f\n" + "ldr s11, [x9, #0x0]\n" + "ldr s14, [x26, #0x0]\n" + "ldr s19, [x25, #0x0]\n" + "ldr s22, [x24, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "b 159f\n" 
+ "155:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 157f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "tbz x11, #1, 156f\n" + "ldr d10, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "mov x20, #0x18\n" + "ldr d18, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v10.s }[2], [x9]\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v26.s }[2], [x23]\n" + "b 159f\n" + "156:" // Height 5: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 159f\n" + "ldr s10, [x9, #0x0]\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "b 159f\n" + "157:" // Height 5: Partial accumulate: partial_2_0 + "tbz x11, #1, 158f\n" + "ldr d9, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "mov x20, #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x11, #0, 159f\n" + "ld1 { v9.s }[2], [x9]\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x25]\n" + "ld1 { v20.s }[2], [x24]\n" + "ld1 { v25.s }[2], [x23]\n" + "b 159f\n" + "158:" // Height 5: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "ldr s12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr s17, [x25, #0x0]\n" + "ldr s20, [x24, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "159:" // Height 5: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 161f\n" + "160:" // Height 5: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q17, [x25, #0x0]\n" + "ldr q18, [x25, #0x10]\n" + "ldr q19, [x25, #0x20]\n" + "ldr q24, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, 
[x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "ldr q25, [x23, #0x0]\n" + "ldr q26, [x23, #0x10]\n" + "ldr q27, [x23, #0x20]\n" + "ldr q6, [x23, #0x30]\n" + "161:" // Height 5: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 163f\n" + "162:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "163:" // Height 5: setup done + "mov x28, #0x0\n" + "164:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 165f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr 
x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 166f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "b 166f\n" + "165:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "166:" // Height 5: input setup done + "cmp x27, #0x8\n" + "blt 169f\n" + "ldr q1, [x26, #0x0]\n" + "ldr q2, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q5, [x22, #0x0]\n" + "ldr q7, [x10, #0x0]\n" + "blt 168f\n" + "167:" // Height 5: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "sub x27, x27, #0x8\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, 
v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q2, [x25, #0x0]\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + "ldr q4, [x23, #0x0]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // 
bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "ldr q3, [x24, #0x0]\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "ldr q5, [x22, #0x0]\n" + "bge 167b\n" + "168:" // Height 5: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x26, x26, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "add x22, x22, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + 
".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla 
v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "169:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 174f\n" + "cmp x27, #0x4\n" + "blt 171f\n" + "170:" // Height 5: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" + "sub x27, x27, #0x4\n" + "ldr d3, [x24], #0x8\n" + "ldr d4, [x23], #0x8\n" + "cmp x27, #0x4\n" + "ldr d5, [x22], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 
0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 170b\n" + "171:" // Height 5: Multiply loop: Skip odd blocks + "cbz x27, 174f\n" + "tbz x27, #1, 172f\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s5, [x22], #0x4\n" + "tbz x27, #0, 173f\n" + "ld1 { v1.h }[2], [x26]\n" + "ld1 { v2.h }[2], [x25]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x23]\n" + "ld1 { v5.h }[2], [x22]\n" + "b 173f\n" + "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "ldr h2, [x25, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x23, #0x0]\n" + "ldr h5, [x22, #0x0]\n" + "173:" // Height 5: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 
0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "174:" // Height 5: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 164b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x23, x24, x20, LSL #2\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "prfm pstl1keep, [x23, #0x0]\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v24.2d, v24.2d, v28.2d\n" + "uzp1 v25.2d, v25.2d, v29.2d\n" + "uzp1 v26.2d, v26.2d, v30.2d\n" + "uzp1 v27.2d, v27.2d, v31.2d\n" + "tbz %x[flags], #1, 175f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x21]\n" + "ld1r { v0.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v1.4s\n" + "fmin v13.4s, v13.4s, v1.4s\n" + 
"fmin v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v1.4s\n" + "fmin v9.4s, v9.4s, v1.4s\n" + "fmin v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v1.4s\n" + "fmin v20.4s, v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v1.4s\n" + "fmin v22.4s, v22.4s, v1.4s\n" + "fmin v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v1.4s\n" + "fmin v19.4s, v19.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v1.4s\n" + "fmin v25.4s, v25.4s, v1.4s\n" + "fmin v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v0.4s\n" + "fmax v13.4s, v13.4s, v0.4s\n" + "fmax v14.4s, v14.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v0.4s\n" + "fmax v9.4s, v9.4s, v0.4s\n" + "fmax v10.4s, v10.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v0.4s\n" + "fmax v25.4s, v25.4s, v0.4s\n" + "fmax v26.4s, v26.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v0.4s\n" + "175:" // Height 5: No activation + "cmp x11, #0x10\n" + "bge 184f\n" + "tbz x11, #3, 179f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "st1 { v25.4s }, [x23], #0x10\n" + "tbz x11, #2, 177f\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "st1 { v26.4s }, [x23], #0x10\n" + "tbz x11, #1, 176f\n" + "str d14, [x9], #0x8\n" + "str d11, [x26], #0x8\n" + "str d22, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d27, [x23], 
#0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "st1 { v27.s }[2], [x23]\n" + "b 183f\n" + "176:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 183f\n" + "str s14, [x9, #0x0]\n" + "str s11, [x26, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "str s27, [x23, #0x0]\n" + "b 183f\n" + "177:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 178f\n" + "str d13, [x9], #0x8\n" + "str d10, [x26], #0x8\n" + "str d21, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "st1 { v26.s }[2], [x23]\n" + "b 183f\n" + "178:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 183f\n" + "str s13, [x9, #0x0]\n" + "str s10, [x26, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "str s26, [x23, #0x0]\n" + "b 183f\n" + "179:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 181f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v24.4s }, [x23], #0x10\n" + "tbz x11, #1, 180f\n" + "str d12, [x9], #0x8\n" + "str d9, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v17.s }[2], [x24]\n" + "st1 { v25.s }[2], [x23]\n" + "b 183f\n" + "180:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 183f\n" + "str s12, [x9, #0x0]\n" + "str s9, [x26, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "str s25, [x23, #0x0]\n" + "b 183f\n" + "181:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 182f\n" + "str d7, [x9], #0x8\n" + 
"str d8, [x26], #0x8\n" + "str d15, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x11, #0, 183f\n" + "st1 { v7.s }[2], [x9]\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "st1 { v24.s }[2], [x23]\n" + "b 183f\n" + "182:" // Height 5: Partial direct writeback: partial_1_0 + "str s7, [x9, #0x0]\n" + "str s8, [x26, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "str s24, [x23, #0x0]\n" + "183:" // Height 5: Partial direct writeback: Done + "b 185f\n" + "184:" // Height 5: Full writeback + "str q7, [x9, #0x0]\n" + "str q12, [x9, #0x10]\n" + "str q13, [x9, #0x20]\n" + "str q14, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q15, [x25, #0x0]\n" + "str q20, [x25, #0x10]\n" + "str q21, [x25, #0x20]\n" + "str q22, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "185:" // Height 5: Writeback done + "subs x11, x11, #0x10\n" + "bgt 150b\n" + "b 224f\n" + "186:" // Height 6 + "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "mov x20, #0x18\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "madd x20, x21, x20, x9\n" + "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "187:" // Height 6: Column loop + "cbz x10, 188f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "zip2 v12.2d, v8.2d, v8.2d\n" + "zip1 v8.2d, v8.2d, v8.2d\n" + "zip2 v13.2d, v9.2d, v9.2d\n" + "zip1 v9.2d, v9.2d, v9.2d\n" + "zip2 v14.2d, v10.2d, v10.2d\n" + "zip1 v10.2d, v10.2d, v10.2d\n" + "zip2 v15.2d, v11.2d, v11.2d\n" + 
"zip1 v11.2d, v11.2d, v11.2d\n" + "mov v16.16b, v8.16b\n" + "mov v20.16b, v12.16b\n" + "mov v17.16b, v9.16b\n" + "mov v21.16b, v13.16b\n" + "mov v18.16b, v10.16b\n" + "mov v22.16b, v14.16b\n" + "mov v19.16b, v11.16b\n" + "mov v23.16b, v15.16b\n" + "mov v24.16b, v8.16b\n" + "mov v28.16b, v12.16b\n" + "mov v25.16b, v9.16b\n" + "mov v29.16b, v13.16b\n" + "mov v26.16b, v10.16b\n" + "mov v30.16b, v14.16b\n" + "mov v27.16b, v11.16b\n" + "mov v31.16b, v15.16b\n" + "b 200f\n" + "188:" // Height 6: no bias + "tbz %x[flags], #0, 199f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x10\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "add x23, x24, x20, LSL #2\n" + "add x22, x23, x20, LSL #2\n" + "bge 197f\n" + "tbz x11, #3, 192f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x22], #0x10\n" + "ld1 { v10.4s }, [x9], #0x10\n" + "ld1 { v13.4s }, [x26], #0x10\n" + "ld1 { v18.4s }, [x25], #0x10\n" + "ld1 { v21.4s }, [x24], #0x10\n" + "ld1 { v26.4s }, [x23], #0x10\n" + "ld1 { v29.4s }, [x22], #0x10\n" + "tbz x11, #2, 190f\n" + "ld1 { v11.4s }, [x9], #0x10\n" + "ld1 { v14.4s }, [x26], #0x10\n" + "ld1 { v19.4s }, [x25], #0x10\n" + "ld1 { v22.4s }, [x24], #0x10\n" + "ld1 { v27.4s }, [x23], #0x10\n" + "ld1 { v30.4s }, [x22], #0x10\n" + "tbz x11, #1, 189f\n" + "ldr d16, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "mov x20, #0x38\n" + "ldr d24, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d6, [x23], #0x8\n" + "ldr d31, [x22], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v16.s }[2], [x9]\n" + "ld1 { v15.s }[2], [x26]\n" + "ld1 { v24.s }[2], [x25]\n" + "ld1 { v23.s }[2], [x24]\n" + "ld1 { v6.s }[2], [x23]\n" + "ld1 { v31.s }[2], [x22]\n" + "b 196f\n" + "189:" // Height 6: Partial accumulate: partial_1_12 + "mov x20, #0x30\n" + "tbz x11, #0, 196f\n" + "ldr s16, [x9, 
#0x0]\n" + "ldr s15, [x26, #0x0]\n" + "ldr s24, [x25, #0x0]\n" + "ldr s23, [x24, #0x0]\n" + "ldr s6, [x23, #0x0]\n" + "ldr s31, [x22, #0x0]\n" + "b 196f\n" + "190:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 191f\n" + "ldr d11, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "mov x20, #0x28\n" + "ldr d19, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d30, [x22], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v11.s }[2], [x9]\n" + "ld1 { v14.s }[2], [x26]\n" + "ld1 { v19.s }[2], [x25]\n" + "ld1 { v22.s }[2], [x24]\n" + "ld1 { v27.s }[2], [x23]\n" + "ld1 { v30.s }[2], [x22]\n" + "b 196f\n" + "191:" // Height 6: Partial accumulate: partial_1_8 + "mov x20, #0x20\n" + "tbz x11, #0, 196f\n" + "ldr s11, [x9, #0x0]\n" + "ldr s14, [x26, #0x0]\n" + "ldr s19, [x25, #0x0]\n" + "ldr s22, [x24, #0x0]\n" + "ldr s27, [x23, #0x0]\n" + "ldr s30, [x22, #0x0]\n" + "b 196f\n" + "192:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 194f\n" + "ld1 { v9.4s }, [x9], #0x10\n" + "ld1 { v12.4s }, [x26], #0x10\n" + "ld1 { v17.4s }, [x25], #0x10\n" + "ld1 { v20.4s }, [x24], #0x10\n" + "ld1 { v25.4s }, [x23], #0x10\n" + "ld1 { v28.4s }, [x22], #0x10\n" + "tbz x11, #1, 193f\n" + "ldr d10, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "mov x20, #0x18\n" + "ldr d18, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d29, [x22], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v10.s }[2], [x9]\n" + "ld1 { v13.s }[2], [x26]\n" + "ld1 { v18.s }[2], [x25]\n" + "ld1 { v21.s }[2], [x24]\n" + "ld1 { v26.s }[2], [x23]\n" + "ld1 { v29.s }[2], [x22]\n" + "b 196f\n" + "193:" // Height 6: Partial accumulate: partial_1_4 + "mov x20, #0x10\n" + "tbz x11, #0, 196f\n" + "ldr s10, [x9, #0x0]\n" + "ldr s13, [x26, #0x0]\n" + "ldr s18, [x25, #0x0]\n" + "ldr s21, [x24, #0x0]\n" + "ldr s26, [x23, #0x0]\n" + "ldr s29, [x22, #0x0]\n" + "b 196f\n" + "194:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 195f\n" + "ldr d9, [x9], #0x8\n" + "ldr 
d12, [x26], #0x8\n" + "mov x20, #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d28, [x22], #0x8\n" + "tbz x11, #0, 196f\n" + "ld1 { v9.s }[2], [x9]\n" + "ld1 { v12.s }[2], [x26]\n" + "ld1 { v17.s }[2], [x25]\n" + "ld1 { v20.s }[2], [x24]\n" + "ld1 { v25.s }[2], [x23]\n" + "ld1 { v28.s }[2], [x22]\n" + "b 196f\n" + "195:" // Height 6: Partial accumulate: partial_1_0 + "ldr s9, [x9, #0x0]\n" + "ldr s12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr s17, [x25, #0x0]\n" + "ldr s20, [x24, #0x0]\n" + "ldr s25, [x23, #0x0]\n" + "ldr s28, [x22, #0x0]\n" + "196:" // Height 6: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 198f\n" + "197:" // Height 6: full accumulate + "ldr q9, [x9, #0x0]\n" + "ldr q10, [x9, #0x10]\n" + "ldr q11, [x9, #0x20]\n" + "ldr q16, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q17, [x25, #0x0]\n" + "ldr q18, [x25, #0x10]\n" + "ldr q19, [x25, #0x20]\n" + "ldr q24, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, [x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "ldr q25, [x23, #0x0]\n" + "ldr q26, [x23, #0x10]\n" + "ldr q27, [x23, #0x20]\n" + "ldr q6, [x23, #0x30]\n" + "ldr q28, [x22, #0x0]\n" + "ldr q29, [x22, #0x10]\n" + "ldr q30, [x22, #0x20]\n" + "ldr q31, [x22, #0x30]\n" + "198:" // Height 6: MMLA fixup + "zip1 v8.2d, v9.2d, v12.2d\n" + "zip2 v12.2d, v9.2d, v12.2d\n" + "zip1 v9.2d, v10.2d, v13.2d\n" + "zip2 v13.2d, v10.2d, v13.2d\n" + "zip1 v10.2d, v11.2d, v14.2d\n" + "zip2 v14.2d, v11.2d, v14.2d\n" + "zip1 v11.2d, v16.2d, v15.2d\n" + "zip2 v15.2d, v16.2d, v15.2d\n" + "zip1 v16.2d, v17.2d, v20.2d\n" + "zip2 v20.2d, v17.2d, v20.2d\n" + "zip1 v17.2d, v18.2d, v21.2d\n" + "zip2 v21.2d, v18.2d, v21.2d\n" + "zip1 v18.2d, v19.2d, v22.2d\n" + "zip2 v22.2d, v19.2d, v22.2d\n" + "zip1 v19.2d, v24.2d, v23.2d\n" + "zip2 v23.2d, v24.2d, v23.2d\n" + "zip1 v24.2d, v25.2d, v28.2d\n" + "zip2 v28.2d, 
v25.2d, v28.2d\n" + "zip1 v25.2d, v26.2d, v29.2d\n" + "zip2 v29.2d, v26.2d, v29.2d\n" + "zip1 v26.2d, v27.2d, v30.2d\n" + "zip2 v30.2d, v27.2d, v30.2d\n" + "zip1 v27.2d, v6.2d, v31.2d\n" + "zip2 v31.2d, v6.2d, v31.2d\n" + "b 200f\n" + "199:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "200:" // Height 6: setup done + "mov x28, #0x0\n" + "201:" // Height 6: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 202f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" + "cbnz x28, 203f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "add x21, x21, x20, LSL #1\n" + "b 203f\n" + "202:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" + "203:" // Height 6: input setup done + "cmp x27, #0x8\n" + "blt 206f\n" + "ldr q1, [x26, #0x0]\n" + 
"ldr q2, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q4, [x23, #0x0]\n" + "ldr q5, [x22, #0x0]\n" + "ldr q6, [x21, #0x0]\n" + "ldr q7, [x10, #0x0]\n" + "blt 205f\n" + "204:" // Height 6: Multiply loop: Main loop head + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x23, x23, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" 
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + "ldr q2, [x25, #0x0]\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + "ldr q4, [x23, #0x0]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0x0]\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + "ldr q1, [x26, #0x0]\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + "ldr q3, [x24, #0x0]\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "ldr 
q5, [x22, #0x0]\n" + "ldr q6, [x21, #0x0]\n" + "bge 204b\n" + "205:" // Height 6: Multiply loop: Single iteration only + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn2 v1.2d, v1.2d, v2.2d\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn2 v3.2d, v3.2d, v4.2d\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "trn2 v5.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + "add x22, x22, #0x10\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + "add x21, x21, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + "prfm pldl1keep, [x24, #0x80]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x23, #0x80]\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + "prfm pldl1keep, [x22, #0x80]\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + "prfm pldl1keep, [x21, #0x80]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla 
v27.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x80]\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x90]\n" + ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xa0]\n" + ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xb0]\n" + ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xc0]\n" + ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xd0]\n" + ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" + "ldr q7, [x10, #0xe0]\n" + ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" + "ldr q6, [x10, #0xf0]\n" + "add x10, x10, #0x100\n" + ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" + ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" + ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" + ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" + ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" + ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" + "206:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 211f\n" + "cmp x27, #0x4\n" + "blt 208f\n" + "207:" // Height 6: Multiply loop: Odd block loop + "ldr d1, [x26], #0x8\n" + "ldr d2, [x25], #0x8\n" + "sub x27, x27, #0x4\n" + "ldr d3, [x24], #0x8\n" + "ldr d4, 
[x23], #0x8\n" + "cmp x27, #0x4\n" + "ldr d5, [x22], #0x8\n" + "ldr d7, [x21], #0x8\n" + "ldr q6, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v7.2d\n" + "ldr q7, [x10, #0x10]\n" + ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x20]\n" + ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x30]\n" + ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x40]\n" + ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x50]\n" + ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x60]\n" + ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" + ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" + "bge 207b\n" + "208:" // Height 6: Multiply loop: Skip odd blocks + "cbz x27, 211f\n" + "tbz x27, #1, 209f\n" + "ldr s1, [x26], #0x4\n" + "ldr s2, [x25], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s4, [x23], #0x4\n" + "ldr s5, [x22], #0x4\n" + "ldr s6, [x21], #0x4\n" + "tbz x27, #0, 
210f\n" + "ld1 { v1.h }[2], [x26]\n" + "ld1 { v2.h }[2], [x25]\n" + "ld1 { v3.h }[2], [x24]\n" + "ld1 { v4.h }[2], [x23]\n" + "ld1 { v5.h }[2], [x22]\n" + "ld1 { v6.h }[2], [x21]\n" + "b 210f\n" + "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 + "ldr h1, [x26, #0x0]\n" + "ldr h2, [x25, #0x0]\n" + "ldr h3, [x24, #0x0]\n" + "ldr h4, [x23, #0x0]\n" + "ldr h5, [x22, #0x0]\n" + "ldr h6, [x21, #0x0]\n" + "210:" // Height 6: Multiply loop: Ragged operand read: Done + "ldr q7, [x10, #0x0]\n" + "trn1 v0.2d, v1.2d, v2.2d\n" + "trn1 v2.2d, v3.2d, v4.2d\n" + "trn1 v4.2d, v5.2d, v6.2d\n" + "ldr q6, [x10, #0x10]\n" + ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x20]\n" + ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x30]\n" + ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x40]\n" + ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x50]\n" + ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" + "ldr q7, [x10, #0x60]\n" + ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" + ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" + "ldr q6, [x10, #0x70]\n" + "add x10, x10, #0x80\n" + ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" + ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" + ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" + ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, 
v6.8h\n" + ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" + ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" + "211:" // Height 6: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 201b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "uzp1 v7.2d, v8.2d, v12.2d\n" + "uzp2 v8.2d, v8.2d, v12.2d\n" + "prfm pstl1keep, [x9, #0x0]\n" + "uzp1 v12.2d, v9.2d, v13.2d\n" + "uzp2 v9.2d, v9.2d, v13.2d\n" + "uzp1 v13.2d, v10.2d, v14.2d\n" + "uzp2 v10.2d, v10.2d, v14.2d\n" + "add x26, x9, x20, LSL #2\n" + "add x25, x26, x20, LSL #2\n" + "add x24, x25, x20, LSL #2\n" + "uzp1 v14.2d, v11.2d, v15.2d\n" + "uzp2 v11.2d, v11.2d, v15.2d\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x23, x24, x20, LSL #2\n" + "uzp1 v15.2d, v16.2d, v20.2d\n" + "uzp2 v16.2d, v16.2d, v20.2d\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x22, x23, x20, LSL #2\n" + "uzp1 v20.2d, v17.2d, v21.2d\n" + "uzp2 v17.2d, v17.2d, v21.2d\n" + "prfm pstl1keep, [x24, #0x0]\n" + "uzp1 v21.2d, v18.2d, v22.2d\n" + "uzp2 v18.2d, v18.2d, v22.2d\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x22, #0x0]\n" + "uzp1 v22.2d, v19.2d, v23.2d\n" + "uzp2 v19.2d, v19.2d, v23.2d\n" + "uzp1 v23.2d, v24.2d, v28.2d\n" + "uzp2 v24.2d, v24.2d, v28.2d\n" + "uzp1 v28.2d, v25.2d, v29.2d\n" + "uzp2 v25.2d, v25.2d, v29.2d\n" + "uzp1 v29.2d, v26.2d, v30.2d\n" + "uzp2 v26.2d, v26.2d, v30.2d\n" + "uzp1 v30.2d, v27.2d, v31.2d\n" + "uzp2 v27.2d, v27.2d, v31.2d\n" + "tbz %x[flags], #1, 212f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.4s }, [x21]\n" + "ld1r { v0.4s }, [x20]\n" + "fmin v7.4s, v7.4s, v1.4s\n" + "fmin v12.4s, v12.4s, v1.4s\n" + "fmin v13.4s, v13.4s, v1.4s\n" + "fmin v14.4s, v14.4s, v1.4s\n" + "fmin v8.4s, v8.4s, v1.4s\n" + "fmin v9.4s, v9.4s, v1.4s\n" + "fmin v10.4s, v10.4s, v1.4s\n" + "fmin v11.4s, v11.4s, v1.4s\n" + "fmin v15.4s, v15.4s, v1.4s\n" + "fmin v20.4s, 
v20.4s, v1.4s\n" + "fmin v21.4s, v21.4s, v1.4s\n" + "fmin v22.4s, v22.4s, v1.4s\n" + "fmin v16.4s, v16.4s, v1.4s\n" + "fmin v17.4s, v17.4s, v1.4s\n" + "fmin v18.4s, v18.4s, v1.4s\n" + "fmin v19.4s, v19.4s, v1.4s\n" + "fmin v23.4s, v23.4s, v1.4s\n" + "fmin v28.4s, v28.4s, v1.4s\n" + "fmin v29.4s, v29.4s, v1.4s\n" + "fmin v30.4s, v30.4s, v1.4s\n" + "fmin v24.4s, v24.4s, v1.4s\n" + "fmin v25.4s, v25.4s, v1.4s\n" + "fmin v26.4s, v26.4s, v1.4s\n" + "fmin v27.4s, v27.4s, v1.4s\n" + "fmax v7.4s, v7.4s, v0.4s\n" + "fmax v12.4s, v12.4s, v0.4s\n" + "fmax v13.4s, v13.4s, v0.4s\n" + "fmax v14.4s, v14.4s, v0.4s\n" + "fmax v8.4s, v8.4s, v0.4s\n" + "fmax v9.4s, v9.4s, v0.4s\n" + "fmax v10.4s, v10.4s, v0.4s\n" + "fmax v11.4s, v11.4s, v0.4s\n" + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v23.4s, v23.4s, v0.4s\n" + "fmax v28.4s, v28.4s, v0.4s\n" + "fmax v29.4s, v29.4s, v0.4s\n" + "fmax v30.4s, v30.4s, v0.4s\n" + "fmax v24.4s, v24.4s, v0.4s\n" + "fmax v25.4s, v25.4s, v0.4s\n" + "fmax v26.4s, v26.4s, v0.4s\n" + "fmax v27.4s, v27.4s, v0.4s\n" + "212:" // Height 6: No activation + "cmp x11, #0x10\n" + "bge 221f\n" + "tbz x11, #3, 216f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v12.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v9.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v20.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v17.4s }, [x24], #0x10\n" + "st1 { v23.4s }, [x23], #0x10\n" + "st1 { v28.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "st1 { v25.4s }, [x22], #0x10\n" + "tbz x11, #2, 214f\n" + "st1 { v13.4s }, [x9], #0x10\n" + "st1 { v10.4s }, [x26], #0x10\n" + "st1 { v21.4s }, [x25], #0x10\n" + "st1 { v18.4s }, [x24], #0x10\n" + "st1 { v29.4s }, [x23], #0x10\n" + "st1 { v26.4s }, [x22], 
#0x10\n" + "tbz x11, #1, 213f\n" + "str d14, [x9], #0x8\n" + "str d11, [x26], #0x8\n" + "str d22, [x25], #0x8\n" + "str d19, [x24], #0x8\n" + "str d30, [x23], #0x8\n" + "str d27, [x22], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v14.s }[2], [x9]\n" + "st1 { v11.s }[2], [x26]\n" + "st1 { v22.s }[2], [x25]\n" + "st1 { v19.s }[2], [x24]\n" + "st1 { v30.s }[2], [x23]\n" + "st1 { v27.s }[2], [x22]\n" + "b 220f\n" + "213:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 220f\n" + "str s14, [x9, #0x0]\n" + "str s11, [x26, #0x0]\n" + "str s22, [x25, #0x0]\n" + "str s19, [x24, #0x0]\n" + "str s30, [x23, #0x0]\n" + "str s27, [x22, #0x0]\n" + "b 220f\n" + "214:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 215f\n" + "str d13, [x9], #0x8\n" + "str d10, [x26], #0x8\n" + "str d21, [x25], #0x8\n" + "str d18, [x24], #0x8\n" + "str d29, [x23], #0x8\n" + "str d26, [x22], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v13.s }[2], [x9]\n" + "st1 { v10.s }[2], [x26]\n" + "st1 { v21.s }[2], [x25]\n" + "st1 { v18.s }[2], [x24]\n" + "st1 { v29.s }[2], [x23]\n" + "st1 { v26.s }[2], [x22]\n" + "b 220f\n" + "215:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 220f\n" + "str s13, [x9, #0x0]\n" + "str s10, [x26, #0x0]\n" + "str s21, [x25, #0x0]\n" + "str s18, [x24, #0x0]\n" + "str s29, [x23, #0x0]\n" + "str s26, [x22, #0x0]\n" + "b 220f\n" + "216:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 218f\n" + "st1 { v7.4s }, [x9], #0x10\n" + "st1 { v8.4s }, [x26], #0x10\n" + "st1 { v15.4s }, [x25], #0x10\n" + "st1 { v16.4s }, [x24], #0x10\n" + "st1 { v23.4s }, [x23], #0x10\n" + "st1 { v24.4s }, [x22], #0x10\n" + "tbz x11, #1, 217f\n" + "str d12, [x9], #0x8\n" + "str d9, [x26], #0x8\n" + "str d20, [x25], #0x8\n" + "str d17, [x24], #0x8\n" + "str d28, [x23], #0x8\n" + "str d25, [x22], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v12.s }[2], [x9]\n" + "st1 { v9.s }[2], [x26]\n" + "st1 { v20.s }[2], [x25]\n" + "st1 { v17.s }[2], 
[x24]\n" + "st1 { v28.s }[2], [x23]\n" + "st1 { v25.s }[2], [x22]\n" + "b 220f\n" + "217:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 220f\n" + "str s12, [x9, #0x0]\n" + "str s9, [x26, #0x0]\n" + "str s20, [x25, #0x0]\n" + "str s17, [x24, #0x0]\n" + "str s28, [x23, #0x0]\n" + "str s25, [x22, #0x0]\n" + "b 220f\n" + "218:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 219f\n" + "str d7, [x9], #0x8\n" + "str d8, [x26], #0x8\n" + "str d15, [x25], #0x8\n" + "str d16, [x24], #0x8\n" + "str d23, [x23], #0x8\n" + "str d24, [x22], #0x8\n" + "tbz x11, #0, 220f\n" + "st1 { v7.s }[2], [x9]\n" + "st1 { v8.s }[2], [x26]\n" + "st1 { v15.s }[2], [x25]\n" + "st1 { v16.s }[2], [x24]\n" + "st1 { v23.s }[2], [x23]\n" + "st1 { v24.s }[2], [x22]\n" + "b 220f\n" + "219:" // Height 6: Partial direct writeback: partial_1_0 + "str s7, [x9, #0x0]\n" + "str s8, [x26, #0x0]\n" + "str s15, [x25, #0x0]\n" + "str s16, [x24, #0x0]\n" + "str s23, [x23, #0x0]\n" + "str s24, [x22, #0x0]\n" + "220:" // Height 6: Partial direct writeback: Done + "b 222f\n" + "221:" // Height 6: Full writeback + "str q7, [x9, #0x0]\n" + "str q12, [x9, #0x10]\n" + "str q13, [x9, #0x20]\n" + "str q14, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q8, [x26, #0x0]\n" + "str q9, [x26, #0x10]\n" + "str q10, [x26, #0x20]\n" + "str q11, [x26, #0x30]\n" + "str q15, [x25, #0x0]\n" + "str q20, [x25, #0x10]\n" + "str q21, [x25, #0x20]\n" + "str q22, [x25, #0x30]\n" + "str q16, [x24, #0x0]\n" + "str q17, [x24, #0x10]\n" + "str q18, [x24, #0x20]\n" + "str q19, [x24, #0x30]\n" + "str q23, [x23, #0x0]\n" + "str q28, [x23, #0x10]\n" + "str q29, [x23, #0x20]\n" + "str q30, [x23, #0x30]\n" + "str q24, [x22, #0x0]\n" + "str q25, [x22, #0x10]\n" + "str q26, [x22, #0x20]\n" + "str q27, [x22, #0x30]\n" + "222:" // Height 6: Writeback done + "subs x11, x11, #0x10\n" + "bgt 187b\n" + "subs %x[m], %x[m], #0x6\n" + "beq 224f\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], 
#3, 223f\n" + "add x21, x21, #0x6\n" + "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "223:" // Update direct input + "mov x20, #0xc\n" + "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" + "b 1b\n" + "224:" // Exit + : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) + : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), + [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), + [offsetof_N] "I"(offsetof(KernelArgs, N)), + [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), + [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), + [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), + [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), + [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), + [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} diff --git a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h new file mode 100644 index 00000000..a44d89b9 --- /dev/null +++ b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h @@ -0,0 +1,111 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Gets `m_step` value. +/// +/// The starting row index must be divisible by `m_step`. +/// +/// @param m Total number of row. 
+/// +/// @return `m_step` value. +size_t kai_get_m_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m); + +/// Gets `n_step` value. +/// +/// The starting column index must be divisible by `n_step`. +/// +/// @param n Total number of columns. +/// +/// @return `n_step` value. +size_t kai_get_n_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n); + +/// Gets the default row stride in bytes of the LHS matrix. +/// +/// @param[in] k Number of columns of the LHS matrix. +/// +/// @return The default row stride in bytes of the LHS matrix. +size_t kai_get_lhs_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t k); + +/// Gets the offset in bytes to the data element in the LHS matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] k_idx Column index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m_idx, size_t k_idx, size_t stride); + +/// Gets the offset in bytes to the data element in the packed RHS matrix buffer. +/// +/// @param[in] k Number of columns. +/// @param[in] n_idx Row index. +/// @param[in] k_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t k, size_t n_idx, size_t k_idx); + +/// Gets the default row stride in bytes of the destination matrix. +/// +/// @param[in] n Number of columns. +/// +/// @return The default row stride in bytes of the destination matrix. +size_t kai_get_dst_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n); + +/// Gets the offset in bytes to the data element in the destination matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] n_idx Column index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. 
+size_t kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m_idx, size_t n_idx, size_t stride); + +/// Gets the size in bytes of the destination matrix buffer. +/// +/// @param[in] m Number of rows. +/// @param[in] n Number of columns. +/// @param[in] stride Row stride in bytes. +/// +/// @return The size in bytes of the destination matrix buffer. +size_t kai_get_dst_size_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m, size_t n, size_t stride); + +/// Runs the matrix multiplication microkernel followed by a clamp operation. +/// +/// The pointer to each buffer (LHS, packed RHS and output) must be advanced by the offset +/// calculated using the following functions: +/// +/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. +/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. +/// * Output: @ref kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. +/// +/// @param[in] m Number of output rows to be computed. +/// @param[in] n Number of output columns to be computed. +/// @param[in] k Common dimension of the LHS and RHS operands. +/// @param[in] lhs LHS matrix buffer. +/// @param[in] packed_rhs Packed RHS buffer. +/// @param[out] dst Output matrix buffer. +/// @param[in] lhs_stride Row stride in bytes of the LHS matrix. +/// @param[in] dst_stride Row stride in bytes of the output matrix. +/// @param[in] clamp_min Minimum value to clamp the final result. +/// @param[in] clamp_max Maximum value to clamp the final result. 
+void kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( + size_t m, size_t n, size_t k, // + const void* lhs, const void* packed_rhs, void* dst, // + size_t lhs_stride, size_t dst_stride, // + float clamp_min, float clamp_max); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c b/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c new file mode 100644 index 00000000..bc069c1f --- /dev/null +++ b/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c @@ -0,0 +1,220 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "kai_common.h" + +static const size_t block_height = 16; // Block granularity along N used by the offset helpers below. + +size_t kai_get_n_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n) { // Always reports an N step of 16; `n` is deliberately unused. + KAI_UNUSED(n); + return 16; +} + +size_t kai_get_k_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t k) { // The K step is the `k` value passed in, returned unchanged. + return k; +} + +size_t kai_get_rhs_default_stride_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n) { // Default RHS stride in bytes: `n` elements of sizeof(uint16_t) bytes each. + return n * sizeof(uint16_t); +} + +size_t kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t n_idx, size_t k_idx, size_t stride) { // Byte offset into the RHS buffer for (n_idx, k_idx == 0). + KAI_ASSUME(n_idx % block_height == 0); // Precondition: n_idx is a multiple of block_height. + KAI_ASSUME(k_idx == 0); // Precondition: only k_idx == 0 is accepted. + KAI_UNUSED(stride); // The stride does not enter the result. 
+ + return n_idx * sizeof(uint16_t); // n_idx elements of sizeof(uint16_t) bytes. +} + +size_t kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t k, size_t n_idx, size_t k_idx) { // Byte offset into the packed RHS buffer for (n_idx, k_idx == 0). + KAI_ASSUME(n_idx % block_height == 0); // Precondition: n_idx is a multiple of block_height. + KAI_ASSUME(k_idx == 0); // Precondition: only k_idx == 0 is accepted. + + return n_idx / block_height * (block_height * sizeof(uint16_t) + block_height * k * sizeof(uint16_t)); // Blocks before n_idx times bytes per block (block_height values plus block_height * k values, sizeof(uint16_t) bytes each). +} + +size_t kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n, size_t k) { // Size is the packed offset evaluated at kai_round_up_multiple_usize(n, block_height). 
+ return kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + k, kai_round_up_multiple_usize(n, block_height), 0); +} + +void kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t n, size_t k, // + const void* rhs, const void* bias, void* packed_rhs, // + size_t rhs_stride) { + size_t height = k; // Rows of the RHS matrix still to be packed; decremented by the assembly below. + const size_t width = n; // Columns of the RHS matrix. + const void* in = rhs; + void* out = packed_rhs; + const size_t in_stride = rhs_stride; + size_t out_stride = block_height * height * sizeof(uint16_t) + block_height * sizeof(uint16_t); + + __asm__ __volatile__( + "mov x22, %x[width]\n" + "mov x21, %x[out]\n" + "cmp x22, #0x10\n" + "blt 2f\n" + "1:" // Bias: Full row loop body + "ldr q16, [%x[bias], #0x0]\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" + "str q16, [x21, #0x0]\n" + "ldr q16, [%x[bias], #0x10]\n" + "add %x[bias], %x[bias], #0x20\n" + "str q16, [x21, #0x10]\n" + "add x21, x21, %x[out_stride]\n" + "bge 1b\n" + "2:" // Bias: Tail row loop start + "cbz x22, 4f\n" + "3:" // Bias: Tail row loop body + "ldr h20, [%x[bias], #0x0]\n" + "sub x22, x22, #0x1\n" + "add %x[bias], %x[bias], #0x2\n" + "str h20, [x21]\n" + "add x21, x21, #0x2\n" + "cbnz x22, 3b\n" + "4:" // Bias: Done + "cmp %x[height], #0x4\n" + "add %x[out], %x[out], #0x20\n" + "blt 13f\n" + "5:" // Main row loop: Head + "mov x25, %x[in]\n" + "mov x24, %x[width]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x4\n" + "add x22, x25, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "cmp x24, #0x10\n" + "add %x[in], x20, %x[in_stride]\n" + "blt 7f\n" + "6:" // Main row loop: Column loop + "ldr q23, [x25], #0x10\n" + "ldr q22, [x22], #0x10\n" + "sub x24, x24, #0x10\n" + "ldr q21, [x21], #0x10\n" + "ldr q20, [x20], #0x10\n" + "cmp x24, #0x10\n" + "ldr q19, [x25], #0x10\n" + "ldr q18, [x22], #0x10\n" + "ldr q17, [x21], #0x10\n" + "ldr q16, [x20], #0x10\n" + "str q23, [x23, #0x0]\n" + "str q19, [x23, #0x10]\n" + "str q22, [x23, #0x20]\n" + 
"str q18, [x23, #0x30]\n" + "str q21, [x23, #0x40]\n" + "str q17, [x23, #0x50]\n" + "str q20, [x23, #0x60]\n" + "str q16, [x23, #0x70]\n" + "add x23, x23, %x[out_stride]\n" + "bge 6b\n" + "7:" // Main row loop: Column loop skip + "cbz x24, 12f\n" + "cmp x24, #0x4\n" + "movi v16.8h, #0x0\n" + "str q16, [x23, #0x0]\n" + "str q16, [x23, #0x10]\n" + "str q16, [x23, #0x20]\n" + "str q16, [x23, #0x30]\n" + "str q16, [x23, #0x40]\n" + "str q16, [x23, #0x50]\n" + "str q16, [x23, #0x60]\n" + "str q16, [x23, #0x70]\n" + "blt 9f\n" + "8:" // Main row loop: width 4 loop: loop + "ldr d19, [x25], #0x8\n" + "ldr d18, [x22], #0x8\n" + "sub x24, x24, #0x4\n" + "ldr d17, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "cmp x24, #0x4\n" + "str d19, [x23, #0x0]\n" + "str d18, [x23, #0x20]\n" + "str d17, [x23, #0x40]\n" + "str d16, [x23, #0x60]\n" + "add x23, x23, #0x8\n" + "bge 8b\n" + "9:" // Main row loop: width 4 loop: skip + "cmp x24, #0x1\n" + "blt 11f\n" + "10:" // Main row loop: width 1 loop: loop + "ldr h19, [x25], #0x2\n" + "ldr h18, [x22], #0x2\n" + "sub x24, x24, #0x1\n" + "ldr h17, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "cmp x24, #0x1\n" + "str h19, [x23, #0x0]\n" + "str h18, [x23, #0x20]\n" + "str h17, [x23, #0x40]\n" + "str h16, [x23, #0x60]\n" + "add x23, x23, #0x2\n" + "bge 10b\n" + "11:" // Main row loop: width 1 loop: skip + "12:" // Main row loop: odd col skip + "cmp %x[height], #0x4\n" + "add %x[out], %x[out], #0x80\n" + "bge 5b\n" + "cbz %x[height], 22f\n" + "13:" // Main loop skip + "14:" // Tail row loop: Head + "mov x20, %x[width]\n" + "mov x25, %x[in]\n" + "mov x23, %x[out]\n" + "sub %x[height], %x[height], #0x1\n" + "cmp x20, #0x10\n" + "add %x[in], x25, %x[in_stride]\n" + "blt 16f\n" + "15:" // Tail row loop: Column loop + "ldr q17, [x25], #0x10\n" + "sub x20, x20, #0x10\n" + "ldr q16, [x25], #0x10\n" + "cmp x20, #0x10\n" + "str q17, [x23, #0x0]\n" + "str q16, [x23, #0x10]\n" + "add x23, x23, %x[out_stride]\n" + "bge 15b\n" + "16:" // Tail row loop: Column 
loop skip + "cbz x20, 21f\n" + "cmp x20, #0x4\n" + "movi v16.8h, #0x0\n" + "str q16, [x23, #0x0]\n" + "str q16, [x23, #0x10]\n" + "blt 18f\n" + "17:" // Tail row loop: width 4 loop: loop + "ldr d16, [x25], #0x8\n" + "sub x20, x20, #0x4\n" + "cmp x20, #0x4\n" + "str d16, [x23, #0x0]\n" + "add x23, x23, #0x8\n" + "bge 17b\n" + "18:" // Tail row loop: width 4 loop: skip + "cmp x20, #0x1\n" + "blt 20f\n" + "19:" // Tail row loop: width 1 loop: loop + "ldr h16, [x25], #0x2\n" + "sub x20, x20, #0x1\n" + "cmp x20, #0x1\n" + "str h16, [x23, #0x0]\n" + "add x23, x23, #0x2\n" + "bge 19b\n" + "20:" // Tail row loop: width 1 loop: skip + "21:" // Tail row loop: odd col skip + "cmp %x[height], #0x1\n" + "add %x[out], %x[out], #0x20\n" + "bge 14b\n" + "22:" // Done + : [bias] "+&r"(bias), [height] "+&r"(height), [in] "+&r"(in), [out] "+&r"(out) + : [in_stride] "r"(in_stride), [out_stride] "r"(out_stride), [width] "r"(width) + : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", + "x25"); +} diff --git a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h b/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h new file mode 100644 index 00000000..117d69d8 --- /dev/null +++ b/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h @@ -0,0 +1,90 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Gets `n_step` value. +/// +/// The starting row index must be divisible by `n_step`. +/// +/// @param n Total number of row. +/// +/// @return `n_step` value. +size_t kai_get_n_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n); + +/// Gets `k_step` value. 
+/// +/// The starting column index must be divisible by `k_step`. +/// +/// @param k Total number of column. +/// +/// @return `k_step` value. +size_t kai_get_k_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t k); + +/// Gets the default row stride in bytes of the RHS matrix. +/// +/// @param[in] n Number of columns. +/// +/// @return The default row stride in bytes of the RHS matrix. +size_t kai_get_rhs_default_stride_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n); + +/// Gets the offset in bytes to the data element in the RHS matrix buffer. +/// +/// @param[in] n_idx Column index. +/// @param[in] k_idx Row index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t n_idx, size_t k_idx, size_t stride); + +/// Gets the offset in bytes to the data element in the packed RHS buffer. +/// +/// @param[in] k Number of columns. +/// @param[in] n_idx Row index. +/// @param[in] k_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t k, size_t n_idx, size_t k_idx); + +/// Gets the size in bytes of the packed RHS buffer. +/// +/// @param[in] n Number of rows. +/// @param[in] k Number of columns. +/// +/// @return The size in bytes of the packed RHS buffer. +size_t kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n, size_t k); + +/// Runs the RHS packing function, which writes the bias followed by the RHS data into the packed RHS buffer. +/// +/// The pointer of each buffer (RHS, packed RHS and bias) needs to be moved to the starting position, +/// which is determined as follows: +/// +/// * RHS: @ref kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon. +/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon. 
+/// * Bias: the bias element that belongs to the first RHS column to be packed. +/// +/// @param[in] n Number of columns of the RHS matrix to be packed. +/// @param[in] k Number of rows of the RHS matrix to be packed. +/// @param[in] rhs RHS matrix buffer. +/// @param[in] bias Bias matrix buffer. +/// @param[out] packed_rhs Packed RHS buffer. +/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. +void kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( + size_t n, size_t k, // + const void* rhs, const void* bias, void* packed_rhs, // + size_t rhs_stride); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c new file mode 100644 index 00000000..14f5b8b5 --- /dev/null +++ b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c @@ -0,0 +1,498 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include "kai_common.h" + +static const size_t block_height = 16; +static const size_t subblock_width = 4; + +size_t kai_get_n_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n) { + KAI_UNUSED(n); + return 16; +} + +size_t kai_get_k_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t k) { + return k; +} + +size_t kai_get_rhs_default_stride_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n) { + return n * sizeof(uint16_t); +} + +size_t kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t n_idx, size_t k_idx, size_t stride) { + KAI_ASSUME(n_idx % block_height == 0); + KAI_ASSUME(k_idx == 0); + KAI_UNUSED(stride); + + return n_idx * 
sizeof(uint16_t); +} + +size_t kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t k, size_t n_idx, size_t k_idx) { + KAI_ASSUME(n_idx % block_height == 0); + KAI_ASSUME(k_idx == 0); + + return n_idx / block_height * + (block_height * sizeof(uint32_t) + + block_height * kai_round_up_multiple_usize(k, subblock_width) * sizeof(uint16_t)); +} + +size_t kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n, size_t k) { + return kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + k, kai_round_up_multiple_usize(n, block_height), 0); +} + +void kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t n, size_t k, // + const void* rhs, const void* bias, void* packed_rhs, // + size_t rhs_stride) { + size_t height = k; + const size_t width = n; + const void* in = rhs; + void* out = packed_rhs; + const size_t in_stride = rhs_stride; + uint16_t* pad_row = (uint16_t*)alloca(width * sizeof(uint16_t)); // NOTE(review): stack use scales with n. + + if (height % 4) { // Zeroed only when needed: it stands in for the rows missing from the last group of 4. + memset(pad_row, 0, width * sizeof(uint16_t)); + } + + size_t out_stride = + block_height * kai_round_up_multiple_usize(height, 4) * sizeof(uint16_t) + block_height * sizeof(uint32_t); + + __asm__ __volatile__( + "mov x22, %x[width]\n" + "mov x21, %x[out]\n" + "cmp x22, #0x10\n" + "blt 2f\n" + "1:" // Bias: Full row loop body + "ldr q9, [%x[bias], #0x0]\n" + "sub x22, x22, #0x10\n" + "cmp x22, #0x10\n" + "str q9, [x21, #0x0]\n" + "ldr q22, [%x[bias], #0x10]\n" + "str q22, [x21, #0x10]\n" + "ldr q30, [%x[bias], #0x20]\n" + "str q30, [x21, #0x20]\n" + "ldr q3, [%x[bias], #0x30]\n" + "add %x[bias], %x[bias], #0x40\n" + "str q3, [x21, #0x30]\n" + "add x21, x21, %x[out_stride]\n" + "bge 1b\n" + "2:" // Bias: Tail row loop start + "cbz x22, 4f\n" + "3:" // Bias: Tail row loop body + "ldr w20, [%x[bias], #0x0]\n" + "sub x22, x22, #0x1\n" + "add %x[bias], %x[bias], #0x4\n" + "str w20, [x21]\n" + "add x21, x21, #0x4\n" + "cbnz x22, 
3b\n" + "4:" // Bias: Done + "cmp %x[height], #0x10\n" + "add %x[out], %x[out], #0x40\n" + "blt 13f\n" + "5:" // Main row loop: Head + "mov x17, %x[in]\n" + "mov x16, %x[width]\n" + "mov x15, %x[out]\n" + "sub %x[height], %x[height], #0x10\n" + "add x14, x17, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "cmp x16, #0x10\n" + "add x11, x12, %x[in_stride]\n" + "add x10, x11, %x[in_stride]\n" + "add x9, x10, %x[in_stride]\n" + "add x28, x9, %x[in_stride]\n" + "add x27, x28, %x[in_stride]\n" + "add x26, x27, %x[in_stride]\n" + "add x25, x26, %x[in_stride]\n" + "add x24, x25, %x[in_stride]\n" + "add x23, x24, %x[in_stride]\n" + "add x22, x23, %x[in_stride]\n" + "add x21, x22, %x[in_stride]\n" + "add x20, x21, %x[in_stride]\n" + "add %x[in], x20, %x[in_stride]\n" + "blt 7f\n" + "6:" // Main row loop: Column loop + "ldr q4, [x17], #0x10\n" + "ldr q11, [x14], #0x10\n" + "sub x16, x16, #0x10\n" + "ldr q10, [x13], #0x10\n" + "ldr q0, [x12], #0x10\n" + "cmp x16, #0x10\n" + "ldr q14, [x11], #0x10\n" + "ldr q15, [x10], #0x10\n" + "ldr q17, [x9], #0x10\n" + "ldr q16, [x28], #0x10\n" + "ldr q2, [x27], #0x10\n" + "ldr q7, [x26], #0x10\n" + "zip1 v30.8h, v4.8h, v10.8h\n" + "zip1 v26.8h, v11.8h, v0.8h\n" + "ldr q31, [x25], #0x10\n" + "ldr q5, [x24], #0x10\n" + "zip2 v20.8h, v4.8h, v10.8h\n" + "zip2 v4.8h, v11.8h, v0.8h\n" + "ldr q18, [x23], #0x10\n" + "ldr q6, [x22], #0x10\n" + "zip1 v12.8h, v14.8h, v17.8h\n" + "zip1 v1.8h, v15.8h, v16.8h\n" + "ldr q9, [x21], #0x10\n" + "ldr q19, [x20], #0x10\n" + "zip2 v11.8h, v14.8h, v17.8h\n" + "zip2 v14.8h, v15.8h, v16.8h\n" + "ldr q22, [x17], #0x10\n" + "ldr q27, [x14], #0x10\n" + "zip1 v13.8h, v2.8h, v31.8h\n" + "zip1 v0.8h, v7.8h, v5.8h\n" + "ldr q24, [x13], #0x10\n" + "ldr q17, [x12], #0x10\n" + "zip2 v15.8h, v2.8h, v31.8h\n" + "zip2 v21.8h, v7.8h, v5.8h\n" + "ldr q31, [x11], #0x10\n" + "ldr q23, [x10], #0x10\n" + "zip1 v10.8h, v18.8h, v9.8h\n" + "zip1 v8.8h, v6.8h, v19.8h\n" + "ldr q2, [x9], #0x10\n" 
+ "ldr q3, [x28], #0x10\n" + "zip2 v16.8h, v18.8h, v9.8h\n" + "zip2 v9.8h, v6.8h, v19.8h\n" + "ldr q25, [x27], #0x10\n" + "ldr q6, [x26], #0x10\n" + "zip1 v29.8h, v22.8h, v24.8h\n" + "zip1 v18.8h, v27.8h, v17.8h\n" + "ldr q5, [x25], #0x10\n" + "ldr q7, [x24], #0x10\n" + "zip2 v28.8h, v22.8h, v24.8h\n" + "zip2 v27.8h, v27.8h, v17.8h\n" + "ldr q19, [x23], #0x10\n" + "ldr q22, [x22], #0x10\n" + "zip1 v24.8h, v31.8h, v2.8h\n" + "zip1 v17.8h, v23.8h, v3.8h\n" + "zip2 v2.8h, v31.8h, v2.8h\n" + "ldr q31, [x21], #0x10\n" + "zip2 v23.8h, v23.8h, v3.8h\n" + "zip1 v3.8h, v25.8h, v5.8h\n" + "zip2 v5.8h, v25.8h, v5.8h\n" + "zip1 v25.8h, v6.8h, v7.8h\n" + "zip2 v7.8h, v6.8h, v7.8h\n" + "zip1 v6.8h, v19.8h, v31.8h\n" + "zip2 v19.8h, v19.8h, v31.8h\n" + "zip1 v31.8h, v30.8h, v26.8h\n" + "zip2 v26.8h, v30.8h, v26.8h\n" + "ldr q30, [x20], #0x10\n" + "str q31, [x15, #0x0]\n" + "zip1 v31.8h, v20.8h, v4.8h\n" + "zip2 v4.8h, v20.8h, v4.8h\n" + "zip1 v20.8h, v29.8h, v18.8h\n" + "zip2 v29.8h, v29.8h, v18.8h\n" + "zip1 v18.8h, v22.8h, v30.8h\n" + "zip2 v22.8h, v22.8h, v30.8h\n" + "str q26, [x15, #0x10]\n" + "zip1 v30.8h, v28.8h, v27.8h\n" + "zip2 v27.8h, v28.8h, v27.8h\n" + "str q31, [x15, #0x20]\n" + "zip1 v26.8h, v12.8h, v1.8h\n" + "zip2 v31.8h, v12.8h, v1.8h\n" + "str q4, [x15, #0x30]\n" + "zip1 v1.8h, v11.8h, v14.8h\n" + "zip2 v11.8h, v11.8h, v14.8h\n" + "str q20, [x15, #0x40]\n" + "zip1 v14.8h, v24.8h, v17.8h\n" + "zip2 v17.8h, v24.8h, v17.8h\n" + "str q29, [x15, #0x50]\n" + "zip1 v29.8h, v2.8h, v23.8h\n" + "zip2 v12.8h, v2.8h, v23.8h\n" + "str q30, [x15, #0x60]\n" + "zip1 v30.8h, v13.8h, v0.8h\n" + "zip2 v13.8h, v13.8h, v0.8h\n" + "str q27, [x15, #0x70]\n" + "zip1 v0.8h, v15.8h, v21.8h\n" + "zip2 v28.8h, v15.8h, v21.8h\n" + "str q26, [x15, #0x80]\n" + "zip1 v27.8h, v3.8h, v25.8h\n" + "zip2 v26.8h, v3.8h, v25.8h\n" + "str q31, [x15, #0x90]\n" + "zip1 v2.8h, v5.8h, v7.8h\n" + "zip2 v24.8h, v5.8h, v7.8h\n" + "str q1, [x15, #0xa0]\n" + "zip1 v23.8h, v10.8h, v8.8h\n" + "zip2 v7.8h, 
v10.8h, v8.8h\n" + "str q11, [x15, #0xb0]\n" + "zip1 v21.8h, v16.8h, v9.8h\n" + "zip2 v3.8h, v16.8h, v9.8h\n" + "str q14, [x15, #0xc0]\n" + "zip1 v20.8h, v6.8h, v18.8h\n" + "zip2 v18.8h, v6.8h, v18.8h\n" + "str q17, [x15, #0xd0]\n" + "zip1 v17.8h, v19.8h, v22.8h\n" + "zip2 v16.8h, v19.8h, v22.8h\n" + "str q29, [x15, #0xe0]\n" + "str q12, [x15, #0xf0]\n" + "str q30, [x15, #0x100]\n" + "str q13, [x15, #0x110]\n" + "str q0, [x15, #0x120]\n" + "str q28, [x15, #0x130]\n" + "str q27, [x15, #0x140]\n" + "str q26, [x15, #0x150]\n" + "str q2, [x15, #0x160]\n" + "str q24, [x15, #0x170]\n" + "str q23, [x15, #0x180]\n" + "str q7, [x15, #0x190]\n" + "str q21, [x15, #0x1a0]\n" + "str q3, [x15, #0x1b0]\n" + "str q20, [x15, #0x1c0]\n" + "str q18, [x15, #0x1d0]\n" + "str q17, [x15, #0x1e0]\n" + "str q16, [x15, #0x1f0]\n" + "add x15, x15, %x[out_stride]\n" + "bge 6b\n" + "7:" // Main row loop: Column loop skip + "cbz x16, 12f\n" + "cmp x16, #0x4\n" + "movi v16.8h, #0x0\n" + "str q16, [x15, #0x0]\n" + "str q16, [x15, #0x10]\n" + "str q16, [x15, #0x20]\n" + "str q16, [x15, #0x30]\n" + "str q16, [x15, #0x40]\n" + "str q16, [x15, #0x50]\n" + "str q16, [x15, #0x60]\n" + "str q16, [x15, #0x70]\n" + "str q16, [x15, #0x80]\n" + "str q16, [x15, #0x90]\n" + "str q16, [x15, #0xa0]\n" + "str q16, [x15, #0xb0]\n" + "str q16, [x15, #0xc0]\n" + "str q16, [x15, #0xd0]\n" + "str q16, [x15, #0xe0]\n" + "str q16, [x15, #0xf0]\n" + "str q16, [x15, #0x100]\n" + "str q16, [x15, #0x110]\n" + "str q16, [x15, #0x120]\n" + "str q16, [x15, #0x130]\n" + "str q16, [x15, #0x140]\n" + "str q16, [x15, #0x150]\n" + "str q16, [x15, #0x160]\n" + "str q16, [x15, #0x170]\n" + "str q16, [x15, #0x180]\n" + "str q16, [x15, #0x190]\n" + "str q16, [x15, #0x1a0]\n" + "str q16, [x15, #0x1b0]\n" + "str q16, [x15, #0x1c0]\n" + "str q16, [x15, #0x1d0]\n" + "str q16, [x15, #0x1e0]\n" + "str q16, [x15, #0x1f0]\n" + "blt 9f\n" + "8:" // Main row loop: width 4 loop: loop + "ldr d22, [x17], #0x8\n" + "ldr d21, [x14], #0x8\n" + "sub 
x16, x16, #0x4\n" + "ldr d19, [x13], #0x8\n" + "ldr d18, [x12], #0x8\n" + "cmp x16, #0x4\n" + "ldr d23, [x11], #0x8\n" + "ldr d20, [x10], #0x8\n" + "ldr d17, [x9], #0x8\n" + "ldr d16, [x28], #0x8\n" + "ldr d28, [x27], #0x8\n" + "ldr d27, [x26], #0x8\n" + "zip1 v22.8h, v22.8h, v19.8h\n" + "zip1 v19.8h, v21.8h, v18.8h\n" + "ldr d21, [x25], #0x8\n" + "ldr d18, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d25, [x22], #0x8\n" + "zip1 v24.8h, v23.8h, v17.8h\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "ldr d20, [x21], #0x8\n" + "ldr d16, [x20], #0x8\n" + "zip1 v23.8h, v22.8h, v19.8h\n" + "zip2 v19.8h, v22.8h, v19.8h\n" + "zip1 v22.8h, v28.8h, v21.8h\n" + "zip1 v18.8h, v27.8h, v18.8h\n" + "zip1 v21.8h, v24.8h, v17.8h\n" + "zip2 v17.8h, v24.8h, v17.8h\n" + "zip1 v20.8h, v26.8h, v20.8h\n" + "zip1 v16.8h, v25.8h, v16.8h\n" + "str q23, [x15, #0x0]\n" + "str q19, [x15, #0x10]\n" + "zip1 v19.8h, v22.8h, v18.8h\n" + "zip2 v18.8h, v22.8h, v18.8h\n" + "str q21, [x15, #0x80]\n" + "str q17, [x15, #0x90]\n" + "zip1 v17.8h, v20.8h, v16.8h\n" + "zip2 v16.8h, v20.8h, v16.8h\n" + "str q19, [x15, #0x100]\n" + "str q18, [x15, #0x110]\n" + "str q17, [x15, #0x180]\n" + "str q16, [x15, #0x190]\n" + "add x15, x15, #0x20\n" + "bge 8b\n" + "9:" // Main row loop: width 4 loop: skip + "cmp x16, #0x1\n" + "blt 11f\n" + "10:" // Main row loop: width 1 loop: loop + "ldr h23, [x17], #0x2\n" + "ldr h21, [x14], #0x2\n" + "sub x16, x16, #0x1\n" + "ldr h20, [x13], #0x2\n" + "ldr h19, [x12], #0x2\n" + "cmp x16, #0x1\n" + "ldr h22, [x11], #0x2\n" + "ldr h18, [x10], #0x2\n" + "ldr h17, [x9], #0x2\n" + "ldr h16, [x28], #0x2\n" + "ldr h27, [x27], #0x2\n" + "ldr h26, [x26], #0x2\n" + "zip1 v25.8h, v23.8h, v20.8h\n" + "zip1 v21.8h, v21.8h, v19.8h\n" + "ldr h20, [x25], #0x2\n" + "ldr h19, [x24], #0x2\n" + "ldr h24, [x23], #0x2\n" + "ldr h23, [x22], #0x2\n" + "zip1 v22.8h, v22.8h, v17.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "ldr h18, [x21], #0x2\n" + "ldr h16, [x20], #0x2\n" + "zip1 v21.8h, v25.8h, v21.8h\n" + 
"zip1 v20.8h, v27.8h, v20.8h\n" + "zip1 v19.8h, v26.8h, v19.8h\n" + "zip1 v17.8h, v22.8h, v17.8h\n" + "zip1 v18.8h, v24.8h, v18.8h\n" + "zip1 v16.8h, v23.8h, v16.8h\n" + "str d21, [x15, #0x0]\n" + "str d17, [x15, #0x80]\n" + "zip1 v17.8h, v20.8h, v19.8h\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "str d17, [x15, #0x100]\n" + "str d16, [x15, #0x180]\n" + "add x15, x15, #0x8\n" + "bge 10b\n" + "11:" // Main row loop: width 1 loop: skip + "12:" // Main row loop: odd col skip + "cmp %x[height], #0x10\n" + "add %x[out], %x[out], #0x200\n" + "bge 5b\n" + "cbz %x[height], 22f\n" + "13:" // Main loop skip + "14:" // Tail row loop: Head + "mov x17, %x[in]\n" + "mov x20, %x[width]\n" + "cmp %x[height], #0x3\n" + "mov x15, %x[out]\n" + "add x14, x17, %x[in_stride]\n" + "add x13, x14, %x[in_stride]\n" + "add x12, x13, %x[in_stride]\n" + "csel x13, x13, %x[pad_row], GE\n" + "add %x[in], x12, %x[in_stride]\n" + "csel x12, x12, %x[pad_row], GT\n" + "cmp %x[height], #0x1\n" + "sub %x[height], %x[height], #0x4\n" + "csel x14, x14, %x[pad_row], GT\n" + "cmp x20, #0x10\n" + "blt 16f\n" + "15:" // Tail row loop: Column loop + "ldr q20, [x17], #0x10\n" + "ldr q19, [x14], #0x10\n" + "sub x20, x20, #0x10\n" + "ldr q18, [x13], #0x10\n" + "ldr q17, [x12], #0x10\n" + "cmp x20, #0x10\n" + "ldr q24, [x17], #0x10\n" + "ldr q25, [x14], #0x10\n" + "ldr q23, [x13], #0x10\n" + "ldr q16, [x12], #0x10\n" + "zip1 v22.8h, v20.8h, v18.8h\n" + "zip1 v21.8h, v19.8h, v17.8h\n" + "zip2 v20.8h, v20.8h, v18.8h\n" + "zip2 v19.8h, v19.8h, v17.8h\n" + "zip1 v18.8h, v24.8h, v23.8h\n" + "zip1 v17.8h, v25.8h, v16.8h\n" + "zip2 v24.8h, v24.8h, v23.8h\n" + "zip2 v16.8h, v25.8h, v16.8h\n" + "zip1 v23.8h, v22.8h, v21.8h\n" + "zip2 v22.8h, v22.8h, v21.8h\n" + "zip1 v21.8h, v20.8h, v19.8h\n" + "zip2 v20.8h, v20.8h, v19.8h\n" + "zip1 v19.8h, v18.8h, v17.8h\n" + "zip2 v18.8h, v18.8h, v17.8h\n" + "zip1 v17.8h, v24.8h, v16.8h\n" + "zip2 v16.8h, v24.8h, v16.8h\n" + "str q23, [x15, #0x0]\n" + "str q22, [x15, #0x10]\n" + "str q21, 
[x15, #0x20]\n" + "str q20, [x15, #0x30]\n" + "str q19, [x15, #0x40]\n" + "str q18, [x15, #0x50]\n" + "str q17, [x15, #0x60]\n" + "str q16, [x15, #0x70]\n" + "add x15, x15, %x[out_stride]\n" + "bge 15b\n" + "16:" // Tail row loop: Column loop skip + "cbz x20, 21f\n" + "cmp x20, #0x4\n" + "movi v16.8h, #0x0\n" + "str q16, [x15, #0x0]\n" + "str q16, [x15, #0x10]\n" + "str q16, [x15, #0x20]\n" + "str q16, [x15, #0x30]\n" + "str q16, [x15, #0x40]\n" + "str q16, [x15, #0x50]\n" + "str q16, [x15, #0x60]\n" + "str q16, [x15, #0x70]\n" + "blt 18f\n" + "17:" // Tail row loop: width 4 loop: loop + "ldr d18, [x17], #0x8\n" + "ldr d19, [x14], #0x8\n" + "sub x20, x20, #0x4\n" + "ldr d17, [x13], #0x8\n" + "ldr d16, [x12], #0x8\n" + "cmp x20, #0x4\n" + "zip1 v18.8h, v18.8h, v17.8h\n" + "zip1 v16.8h, v19.8h, v16.8h\n" + "zip1 v17.8h, v18.8h, v16.8h\n" + "zip2 v16.8h, v18.8h, v16.8h\n" + "str q17, [x15, #0x0]\n" + "str q16, [x15, #0x10]\n" + "add x15, x15, #0x20\n" + "bge 17b\n" + "18:" // Tail row loop: width 4 loop: skip + "cmp x20, #0x1\n" + "blt 20f\n" + "19:" // Tail row loop: width 1 loop: loop + "ldr h19, [x17], #0x2\n" + "ldr h18, [x14], #0x2\n" + "sub x20, x20, #0x1\n" + "ldr h17, [x13], #0x2\n" + "ldr h16, [x12], #0x2\n" + "cmp x20, #0x1\n" + "zip1 v17.8h, v19.8h, v17.8h\n" + "zip1 v16.8h, v18.8h, v16.8h\n" + "zip1 v16.8h, v17.8h, v16.8h\n" + "str d16, [x15, #0x0]\n" + "add x15, x15, #0x8\n" + "bge 19b\n" + "20:" // Tail row loop: width 1 loop: skip + "21:" // Tail row loop: odd col skip + "cmp %x[height], #0x1\n" + "add %x[out], %x[out], #0x80\n" + "bge 14b\n" + "22:" // Done + : [bias] "+&r"(bias), [height] "+&r"(height), [in] "+&r"(in), [out] "+&r"(out) + : [in_stride] "r"(in_stride), [out_stride] "r"(out_stride), [pad_row] "r"(pad_row), [width] "r"(width) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", 
"v29", + "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", + "x25", "x26", "x27", "x28"); +} diff --git a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h new file mode 100644 index 00000000..ba45e6e7 --- /dev/null +++ b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h @@ -0,0 +1,90 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Gets `n_step` value. +/// +/// The starting row index must be divisible by `n_step`. +/// +/// @param n Total number of row. +/// +/// @return `n_step` value. +size_t kai_get_n_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n); + +/// Gets `k_step` value. +/// +/// The starting column index must be divisible by `k_step`. +/// +/// @param k Total number of column. +/// +/// @return `k_step` value. +size_t kai_get_k_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t k); + +/// Gets the default row stride in bytes of the RHS matrix. +/// +/// @param[in] n Number of columns. +/// +/// @return The default row stride in bytes of the LHS matrix. +size_t kai_get_rhs_default_stride_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n); + +/// Gets the offset in bytes to the data element in the RHS matrix buffer. +/// +/// @param[in] n_idx Column index. +/// @param[in] k_idx Row index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. 
+size_t kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t n_idx, size_t k_idx, size_t stride); + +/// Gets the offset in bytes to the data element in the packed RHS buffer. +/// +/// @param[in] k Number of columns. +/// @param[in] n_idx Row index. +/// @param[in] k_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t k, size_t n_idx, size_t k_idx); + +/// Gets the size in bytes of the packed RHS buffer. +/// +/// @param[in] n Number of rows. +/// @param[in] k Number of columns. +/// +/// @return The size in bytes of the packed RHS buffer. +size_t kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n, size_t k); + +/// Runs the RHS packing function, which writes the bias followed by the RHS data into the packed RHS buffer. +/// +/// The pointer of each buffer (RHS, packed RHS and bias) needs to be moved to the starting position, +/// which is determined as follows: +/// +/// * RHS: @ref kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon. +/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon. +/// * Bias: the bias element that belongs to the first RHS column to be packed. +/// +/// @param[in] n Number of columns of the RHS matrix to be packed. +/// @param[in] k Number of rows of the RHS matrix to be packed. +/// @param[in] rhs RHS matrix buffer. +/// @param[in] bias Bias matrix buffer. +/// @param[out] packed_rhs Packed RHS buffer. +/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. 
+void kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( + size_t n, size_t k, // + const void* rhs, const void* bias, void* packed_rhs, // + size_t rhs_stride); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/test/common/bfloat16.cpp b/test/common/bfloat16.cpp new file mode 100644 index 00000000..d9581b2f --- /dev/null +++ b/test/common/bfloat16.cpp @@ -0,0 +1,17 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test/common/bfloat16.hpp" + +#include + +namespace kai::test { + +std::ostream& operator<<(std::ostream& os, BFloat16 value) { + return os << static_cast(value); +} + +} // namespace kai::test diff --git a/test/common/bfloat16.hpp b/test/common/bfloat16.hpp new file mode 100644 index 00000000..c8c2aac8 --- /dev/null +++ b/test/common/bfloat16.hpp @@ -0,0 +1,88 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include +#include +#include + +#include "test/common/type_traits.hpp" + +namespace kai::test { + +/// Half-precision brain floating-point. +/// +/// This class encapsulates `bfloat16_t` data type provided by `arm_bf16.h`. +class BFloat16 { +public: + /// Constructor. + BFloat16() = default; + + /// Destructor. + ~BFloat16() = default; + + /// Copy constructor. + BFloat16(const BFloat16&) = default; + + /// Copy assignment. + BFloat16& operator=(const BFloat16&) = default; + + /// Move constructor. + BFloat16(BFloat16&&) = default; + + /// Move assignment. + BFloat16& operator=(BFloat16&&) = default; + + /// Creates a new object from the `bfloat16_t` value. + constexpr explicit BFloat16(bfloat16_t value) : _data(value) { + } + + /// Creates a new object from the specified numeric value. 
+ template , bool> = true> + constexpr explicit BFloat16(T value) : _data(static_cast(static_cast(value))) { + } + + /// Assigns to the specified numeric value which will be converted to `bfloat16_t`. + template , bool> = true> + BFloat16& operator=(T value) { + _data = static_cast(value); + return *this; + } + + /// Converts to numeric type `T`. + template , bool> = true> + explicit operator T() const { + return static_cast(_data); + } + + /// Equality operator. + bool operator==(BFloat16 rhs) const { + const auto* lhs_data = reinterpret_cast(&_data); + const auto* rhs_data = reinterpret_cast(&rhs._data); + return *lhs_data == *rhs_data; + } + + /// Inequality operator. + bool operator!=(BFloat16 rhs) const { + return !(*this == rhs); + } + + /// Writes the value to the output stream. + /// + /// @param[in] os Output stream to be written to. + /// @param[in] value Value to be written. + /// + /// @return The output stream. + friend std::ostream& operator<<(std::ostream& os, BFloat16 value); + +private: + bfloat16_t _data; +}; + +} // namespace kai::test diff --git a/test/common/compare.cpp b/test/common/compare.cpp index f1dfecdf..ec561515 100644 --- a/test/common/compare.cpp +++ b/test/common/compare.cpp @@ -10,10 +10,13 @@ #include #include #include +#include #include "src/kai_common.h" +#include "test/common/bfloat16.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/int4.hpp" #include "test/common/logging.hpp" #include "test/common/memory.hpp" @@ -69,13 +72,17 @@ bool compare_raw( return handler.success(full_height * full_width); } -/// Compares matrices with per-row quantization. +/// Compares matrices with per-row bias or per-row quantization. 
template bool compare_per_row( const void* imp_data, const void* ref_data, const DataFormat& format, size_t full_height, size_t full_width, const Rect& rect, MismatchHandler& handler) { - const auto block_height = format.block_height(); - const auto block_width = format.block_width(); + constexpr auto has_scale = !std::is_null_pointer_v; + + const auto block_height = format.actual_block_height(full_height); + const auto block_width = format.actual_block_width(full_width); + const auto subblock_height = format.actual_subblock_height(full_height); + const auto subblock_width = format.actual_subblock_width(full_width); KAI_ASSUME(format.scheduler_block_height(full_height) == block_height); KAI_ASSUME(format.scheduler_block_width(full_width) == full_width); @@ -84,84 +91,83 @@ bool compare_per_row( const auto data_bits = size_in_bits; - const auto num_groups = (full_height + block_height - 1) / block_height; - const auto group_num_blocks = (full_width + block_width - 1) / block_width; - - const auto group_offsets_bytes = block_height * sizeof(Offset); - const auto group_scales_bytes = block_height * sizeof(Scale); - const auto block_data_bytes = block_height * block_width * data_bits / 8; - - const auto begin_group = rect.start_row() / block_height; - const auto end_group = rect.end_row() / block_height; + const auto row_block_zero_points_bytes = block_height * sizeof(Offset); + const auto row_block_scales_bytes = has_scale ? block_height * sizeof(Scale) : 0; + const auto row_block_data_bytes = block_height * block_width * data_bits / 8; const auto* imp_ptr = reinterpret_cast(imp_data); const auto* ref_ptr = reinterpret_cast(ref_data); - for (size_t group_no = 0; group_no < num_groups; ++group_no) { - const auto in_roi = group_no >= begin_group && group_no < end_group; + for (size_t y_block = 0; y_block < full_height; y_block += block_height) { + const auto in_roi = y_block >= rect.start_row() && y_block < rect.end_row(); - // Checks the quantization offsets. 
+ // Checks the zero points. for (size_t i = 0; i < block_height; ++i) { - const auto imp_offset = reinterpret_cast(imp_ptr)[i]; - const Offset ref_offset = in_roi ? reinterpret_cast(ref_ptr)[i] : 0; - const auto [abs_err, rel_err] = calculate_error(imp_offset, ref_offset); + const auto imp_zero_point = reinterpret_cast(imp_ptr)[i]; + const Offset ref_zero_point = in_roi ? reinterpret_cast(ref_ptr)[i] : 0; + const auto [abs_err, rel_err] = calculate_error(imp_zero_point, ref_zero_point); if (abs_err != 0 || rel_err != 0) { handler.mark_as_failed(); - const auto raw_row = group_no * block_height + i; + const auto raw_row = y_block + i; KAI_LOGE( - "Mismatched quantization offset ", raw_row, ": actual = ", imp_offset, ", expected: ", ref_offset); + "Mismatched zero point ", raw_row, ": actual = ", imp_zero_point, ", expected: ", ref_zero_point); } } - imp_ptr += group_offsets_bytes; - ref_ptr += group_offsets_bytes; + imp_ptr += row_block_zero_points_bytes; + ref_ptr += row_block_zero_points_bytes; // Checks the data. - for (size_t block_no = 0; block_no < group_num_blocks; ++block_no) { - for (size_t y = 0; y < block_height; ++y) { - for (size_t x = 0; x < block_width; ++x) { - const auto imp_data = read_array(imp_ptr, y * block_width + x); - const Data ref_data = in_roi ? 
read_array(ref_ptr, y * block_width + x) : Data(0); - const auto [abs_err, rel_err] = calculate_error(imp_data, ref_data); - - if (abs_err != 0 || rel_err != 0) { - const auto notifying = !in_roi || handler.handle_data(abs_err, rel_err); - - if (notifying) { - const auto raw_row = group_no * block_height + y; - const auto raw_col = block_no * block_width + x; - - KAI_LOGE( - "Mismatched data at (", raw_row, ", ", raw_col, "): actual = ", imp_data, - ", expected: ", ref_data); + for (size_t x_block = 0; x_block < full_width; x_block += block_width) { + for (size_t y_subblock = 0; y_subblock < block_height; y_subblock += subblock_height) { + for (size_t x_subblock = 0; x_subblock < block_width; x_subblock += subblock_width) { + for (size_t y = 0; y < subblock_height; ++y) { + for (size_t x = 0; x < subblock_width; ++x) { + const auto offset = (y_subblock + y) * full_width + x_block + x_subblock + x; + const auto imp_data = read_array(imp_ptr, offset); + const Data ref_data = in_roi ? read_array(ref_ptr, offset) : static_cast(0); + const auto [abs_err, rel_err] = calculate_error(imp_data, ref_data); + + if (abs_err != 0 || rel_err != 0) { + const auto notifying = !in_roi || handler.handle_data(abs_err, rel_err); + + if (notifying) { + const auto raw_index = y_block * block_height * block_width + offset; + KAI_LOGE( + "Mismatched data ", raw_index, ": actual = ", imp_data, + ", expected: ", ref_data); + } + } } } } } - - imp_ptr += block_data_bytes; - ref_ptr += block_data_bytes; } - // Checks the quantization scales. - for (size_t i = 0; i < block_height; ++i) { - const auto imp_scale = reinterpret_cast(imp_ptr)[i]; - const Scale ref_scale = in_roi ? reinterpret_cast(ref_ptr)[i] : 0; - const auto [abs_err, rel_err] = calculate_error(imp_scale, ref_scale); + imp_ptr += row_block_data_bytes; + ref_ptr += row_block_data_bytes; - if (abs_err != 0 || rel_err != 0) { - handler.mark_as_failed(); + // Checks the scales (if they exist).
+ if constexpr (has_scale) { + for (size_t i = 0; i < block_height; ++i) { + const auto imp_scale = reinterpret_cast(imp_ptr)[i]; + const Scale ref_scale = in_roi ? reinterpret_cast(ref_ptr)[i] : 0; + const auto [abs_err, rel_err] = calculate_error(imp_scale, ref_scale); - const auto raw_row = group_no * block_height + i; - KAI_LOGE( - "Mismatched quantization scale ", raw_row, ": actual = ", imp_scale, ", expected: ", ref_scale); + if (abs_err != 0 || rel_err != 0) { + handler.mark_as_failed(); + + const auto raw_row = y_block + i; + KAI_LOGE( + "Mismatched quantization scale ", raw_row, ": actual = ", imp_scale, ", expected: ", ref_scale); + } } - } - imp_ptr += group_scales_bytes; - ref_ptr += group_scales_bytes; + imp_ptr += row_block_scales_bytes; + ref_ptr += row_block_scales_bytes; + } } return handler.success(rect.height() * full_width); @@ -176,19 +182,33 @@ bool compare( const auto scale_dt = format.scale_data_type(); const auto offset_dt = format.zero_point_data_type(); - switch (format.quantization_format()) { - case DataFormat::QuantizationFormat::NONE: + switch (format.pack_format()) { + case DataFormat::PackFormat::NONE: switch (data_type) { case DataType::FP32: return compare_raw(imp_data, ref_data, full_height, full_width, rect, handler); + case DataType::FP16: + return compare_raw(imp_data, ref_data, full_height, full_width, rect, handler); + default: break; } break; - case DataFormat::QuantizationFormat::PER_ROW: + case DataFormat::PackFormat::BIAS_PER_ROW: + if (data_type == DataType::FP16 && offset_dt == DataType::FP16) { + return compare_per_row( + imp_data, ref_data, format, full_height, full_width, rect, handler); + } else if (data_type == DataType::BF16 && offset_dt == DataType::FP32) { + return compare_per_row( + imp_data, ref_data, format, full_height, full_width, rect, handler); + } + + break; + + case DataFormat::PackFormat::QUANTIZE_PER_ROW: if (data_type == DataType::QAI8 && scale_dt == DataType::FP32 && offset_dt == DataType::I32) { 
return compare_per_row( imp_data, ref_data, format, full_height, full_width, rect, handler); diff --git a/test/common/data_format.cpp b/test/common/data_format.cpp index 1673b405..a69a085d 100644 --- a/test/common/data_format.cpp +++ b/test/common/data_format.cpp @@ -16,10 +16,10 @@ namespace kai::test { DataFormat::DataFormat( - DataType data_type, size_t block_height, size_t block_width, QuantizationFormat quant_format, DataType scale_dt, - DataType zero_point_dt, size_t subblock_height, size_t subblock_width) noexcept : + DataType data_type, size_t block_height, size_t block_width, PackFormat pack_format, DataType zero_point_dt, + DataType scale_dt, size_t subblock_height, size_t subblock_width) noexcept : _data_type(data_type), - _quant_format(quant_format), + _pack_format(pack_format), _scale_dt(scale_dt), _zero_point_dt(zero_point_dt), _block_height(block_height), @@ -29,7 +29,7 @@ DataFormat::DataFormat( } bool DataFormat::operator==(const DataFormat& rhs) const { - return _data_type == rhs._data_type && _quant_format == rhs._quant_format && _scale_dt == rhs._scale_dt && + return _data_type == rhs._data_type && _pack_format == rhs._pack_format && _scale_dt == rhs._scale_dt && _zero_point_dt == rhs._zero_point_dt && _block_height == rhs._block_height && _block_width == rhs._block_width; } @@ -41,8 +41,8 @@ DataType DataFormat::data_type() const { return _data_type; } -DataFormat::QuantizationFormat DataFormat::quantization_format() const { - return _quant_format; +DataFormat::PackFormat DataFormat::pack_format() const { + return _pack_format; } DataType DataFormat::scale_data_type() const { @@ -54,7 +54,7 @@ DataType DataFormat::zero_point_data_type() const { } bool DataFormat::is_raw() const { - return _quant_format == QuantizationFormat::NONE && // + return _pack_format == PackFormat::NONE && // _block_height == 0 && _block_width == 0 && _subblock_height == 0 && _subblock_width == 0; } @@ -74,46 +74,76 @@ size_t DataFormat::subblock_width() const { return 
_subblock_width; } +size_t DataFormat::actual_block_height(size_t full_height) const { + return _block_height > 0 ? _block_height + : round_up_multiple(full_height, _subblock_height > 0 ? _subblock_height : 1); +} + +size_t DataFormat::actual_block_width(size_t full_width) const { + return _block_width > 0 ? _block_width : round_up_multiple(full_width, _subblock_width > 0 ? _subblock_width : 1); +} + +size_t DataFormat::actual_subblock_height(size_t full_height) const { + return _subblock_height > 0 ? _subblock_height : actual_block_height(full_height); +} + +size_t DataFormat::actual_subblock_width(size_t full_width) const { + return _subblock_width > 0 ? _subblock_width : actual_block_width(full_width); +} + size_t DataFormat::scheduler_block_height([[maybe_unused]] size_t full_height) const { - switch (_quant_format) { - case QuantizationFormat::NONE: - return _block_height > 0 ? _block_height : 1; + const auto padded_block_height = round_up_multiple(_block_height, _subblock_height > 0 ? _subblock_height : 1); + + switch (_pack_format) { + case PackFormat::NONE: + return _block_height > 0 ? padded_block_height : 1; - case QuantizationFormat::PER_ROW: - return _block_height; + case PackFormat::BIAS_PER_ROW: + case PackFormat::QUANTIZE_PER_ROW: + KAI_ASSUME(_block_height > 0); + return padded_block_height; default: - KAI_ERROR("Unsupported quantization packing format!"); + KAI_ERROR("Unsupported packing format!"); } } size_t DataFormat::scheduler_block_width(size_t full_width) const { - switch (_quant_format) { - case QuantizationFormat::NONE: - return _block_width > 0 ? _block_width : 1; + const auto padded_block_width = round_up_multiple(_block_width, _subblock_width > 0 ? _subblock_width : 1); - case QuantizationFormat::PER_ROW: + switch (_pack_format) { + case PackFormat::NONE: + return _block_width > 0 ? 
padded_block_width : 1; + + case PackFormat::BIAS_PER_ROW: + case PackFormat::QUANTIZE_PER_ROW: return full_width; default: - KAI_ERROR("Unsupported quantization packing format!"); + KAI_ERROR("Unsupported packing format!"); } } uintptr_t DataFormat::default_row_stride(size_t width) const { - const auto padded_width = round_up_multiple(width, _block_width > 0 ? _block_width : 1); + const auto padded_width = round_up_multiple(width, actual_block_width(width)); - switch (_quant_format) { - case QuantizationFormat::NONE: + switch (_pack_format) { + case PackFormat::NONE: return padded_width * data_type_size_in_bits(_data_type) / 8; - case QuantizationFormat::PER_ROW: + case PackFormat::BIAS_PER_ROW: + KAI_ASSUME(_block_height > 0); + return _block_height * data_type_size_in_bits(_zero_point_dt) / 8 + // + _block_height * padded_width * data_type_size_in_bits(_data_type) / 8; + + case PackFormat::QUANTIZE_PER_ROW: + KAI_ASSUME(_block_height > 0); return _block_height * data_type_size_in_bits(_zero_point_dt) / 8 + // _block_height * padded_width * data_type_size_in_bits(_data_type) / 8 + // _block_height * data_type_size_in_bits(_scale_dt) / 8; default: - KAI_ERROR("Unsupported quantization packing format!"); + KAI_ERROR("Unsupported packing format!"); } } @@ -122,17 +152,18 @@ uintptr_t DataFormat::default_offset_in_bytes(size_t row, size_t col, size_t wid KAI_ASSERT(col % scheduler_block_width(width) == 0); - switch (_quant_format) { - case QuantizationFormat::NONE: + switch (_pack_format) { + case PackFormat::NONE: return row * row_stride + col * data_type_size_in_bits(_data_type) / 8; - case QuantizationFormat::PER_ROW: - KAI_ASSERT(row % _block_height == 0); - KAI_ASSERT(col == 0); - return (row / _block_height) * row_stride + col * data_type_size_in_bits(_data_type) / 8; + case PackFormat::BIAS_PER_ROW: + case PackFormat::QUANTIZE_PER_ROW: + KAI_ASSUME(row % _block_height == 0); + KAI_ASSUME(col == 0); + return (row / _block_height) * row_stride; default: - 
KAI_ERROR("Unsupported quantization packing format!"); + KAI_ERROR("Unsupported packing format!"); } } diff --git a/test/common/data_format.hpp b/test/common/data_format.hpp index 3aa19f71..1a9b9483 100644 --- a/test/common/data_format.hpp +++ b/test/common/data_format.hpp @@ -16,10 +16,11 @@ namespace kai::test { /// Data format. class DataFormat { public: - /// Quantization packing format. - enum class QuantizationFormat : uint32_t { - NONE, ///< No quantization information is included. - PER_ROW, ///< Per-row quantization. + /// Packing format. + enum class PackFormat : uint32_t { + NONE, ///< No quantization information is included. + BIAS_PER_ROW, ///< Per-row bias. + QUANTIZE_PER_ROW, ///< Per-row quantization. }; /// Creates a new data format. @@ -27,15 +28,15 @@ public: /// @param[in] data_type Data type of data value. /// @param[in] block_height Block height. /// @param[in] block_width Block width. - /// @param[in] quant_format Quantization packing format. - /// @param[in] scale_dt Data type of scale value. + /// @param[in] pack_format Packing format. /// @param[in] zero_point_dt Data type of zero point value. + /// @param[in] scale_dt Data type of scale value. /// @param[in] subblock_height Sub-block height. /// @param[in] subblock_width Sub-block width. DataFormat( - DataType data_type, size_t block_height = 0, size_t block_width = 0, - QuantizationFormat quant_format = QuantizationFormat::NONE, DataType scale_dt = DataType::UNKNOWN, - DataType zero_point_dt = DataType::UNKNOWN, size_t subblock_height = 0, size_t subblock_width = 0) noexcept; + DataType data_type, size_t block_height = 0, size_t block_width = 0, PackFormat pack_format = PackFormat::NONE, + DataType zero_point_dt = DataType::UNKNOWN, DataType scale_dt = DataType::UNKNOWN, size_t subblock_height = 0, + size_t subblock_width = 0) noexcept; /// Equality operator. [[nodiscard]] bool operator==(const DataFormat& rhs) const; @@ -43,8 +44,8 @@ public: /// Unequality operator. 
[[nodiscard]] bool operator!=(const DataFormat& rhs) const; - /// Gets the quantization packing format. - [[nodiscard]] QuantizationFormat quantization_format() const; + /// Gets the packing format. + [[nodiscard]] PackFormat pack_format() const; /// Gets the data type of data value. [[nodiscard]] DataType data_type() const; @@ -55,7 +56,7 @@ public: /// Gets the data type of zero point value. [[nodiscard]] DataType zero_point_data_type() const; - /// Gets a value indicating whether this format has no blocking or packed quantization information. + /// Gets a value indicating whether this format has no blocking or packing information. [[nodiscard]] bool is_raw() const; /// Gets the block height. @@ -70,6 +71,34 @@ public: /// Gets the sub-block width. [[nodiscard]] size_t subblock_width() const; + /// Gets the block height given the full height of the matrix. + /// + /// @param[in] full_height Height of the full matrix. + /// + /// @return The block height. + [[nodiscard]] size_t actual_block_height(size_t full_height) const; + + /// Gets the block width given the full width of the matrix. + /// + /// @param[in] full_width Width of the full matrix. + /// + /// @return The block width. + [[nodiscard]] size_t actual_block_width(size_t full_width) const; + + /// Gets the sub-block height given the full height of the matrix. + /// + /// @param[in] full_height Height of the full matrix. + /// + /// @return The sub-block height. + [[nodiscard]] size_t actual_subblock_height(size_t full_height) const; + + /// Gets the sub-block width given the full width of the matrix. + /// + /// @param[in] full_width Width of the full matrix. + /// + /// @return The sub-block width. + [[nodiscard]] size_t actual_subblock_width(size_t full_width) const; + /// Gets the scheduling block height. /// /// @param[in] full_height Height of the full matrix. @@ -86,7 +115,7 @@ public: /// Gets the row stride in bytes given the data is stored continuously without any gap in the memory. 
/// - /// In case of per-row quantization, the row stride is the number of bytes from one row group + /// In case of per-row bias or quantization, the row stride is the number of bytes from one row group /// to the next. One row group consists of `block_height` rows. /// /// @param[in] width Width of the full matrix. @@ -114,7 +143,7 @@ public: private: DataType _data_type; - QuantizationFormat _quant_format; + PackFormat _pack_format; DataType _scale_dt; DataType _zero_point_dt; size_t _block_height; diff --git a/test/common/data_type.cpp b/test/common/data_type.cpp index 79cc7e7c..95209750 100644 --- a/test/common/data_type.cpp +++ b/test/common/data_type.cpp @@ -69,13 +69,11 @@ bool data_type_is_signed(DataType dt) { } bool data_type_is_quantized(DataType dt) { - KAI_ASSERT_IF(has_q(dt), data_type_is_integral(dt)); - return has_q(dt); + return data_type_is_integral(dt) && has_q(dt); } bool data_type_is_quantized_asymm(DataType dt) { - KAI_ASSERT_IF(has_a(dt), data_type_is_quantized(dt)); - return has_a(dt); + return data_type_is_quantized(dt) && has_a(dt); } } // namespace kai::test diff --git a/test/common/data_type.hpp b/test/common/data_type.hpp index 1144e4d5..3c326cc2 100644 --- a/test/common/data_type.hpp +++ b/test/common/data_type.hpp @@ -39,6 +39,8 @@ enum class DataType : uint16_t { FP32 = 0b0'1'0'0'0000'00100000, ///< Single-precision floating-point. FP16 = 0b0'1'0'0'0000'00010000, ///< Half-precision floating-point. + BF16 = 0b0'1'1'0'0000'00010000, ///< Half-precision brain floating-point. + I32 = 0b1'1'0'0'0000'00100000, ///< 32-bit signed integer. QAI8 = 0b1'1'1'1'0000'00001000, ///< 8-bit signed asymmetric quantized. 
diff --git a/test/common/float16.cpp b/test/common/float16.cpp new file mode 100644 index 00000000..8280bb2d --- /dev/null +++ b/test/common/float16.cpp @@ -0,0 +1,17 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test/common/float16.hpp" + +#include + +namespace kai::test { + +std::ostream& operator<<(std::ostream& os, Float16 value) { + return os << static_cast(value); +} + +} // namespace kai::test diff --git a/test/common/float16.hpp b/test/common/float16.hpp new file mode 100644 index 00000000..5cb2ba93 --- /dev/null +++ b/test/common/float16.hpp @@ -0,0 +1,24 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace kai::test { + +/// Half-precision floating-point. +using Float16 = _Float16; + +/// Writes the value to the output stream. +/// +/// @param[in] os Output stream to be written to. +/// @param[in] value Value to be written. +/// +/// @return The output stream. 
+std::ostream& operator<<(std::ostream& os, Float16 value); + +} // namespace kai::test diff --git a/test/common/int4.cpp b/test/common/int4.cpp index bc9563b1..ae0a5998 100644 --- a/test/common/int4.cpp +++ b/test/common/int4.cpp @@ -9,13 +9,22 @@ #include #include +#include "kai_common.h" + namespace kai::test { UInt4& UInt4::operator=(uint8_t value) { + KAI_ASSUME(value >= 0 && value < 16); _value = value; return *this; } +UInt4& UInt4::operator=(int value) { + KAI_ASSUME(value >= 0 && value < 16); + _value = static_cast(value); + return *this; +} + UInt4::operator int32_t() const { return _value; } @@ -54,10 +63,17 @@ std::tuple UInt4::unpack_u8(uint8_t value) { // ===================================================================================================================== Int4& Int4::operator=(int8_t value) { + KAI_ASSUME(value >= -8 && value < 8); _value = value; return *this; } +Int4& Int4::operator=(int value) { + KAI_ASSUME(value >= -8 && value < 8); + _value = static_cast(value); + return *this; +} + Int4::operator int32_t() const { return _value; } diff --git a/test/common/int4.hpp b/test/common/int4.hpp index 169c5817..7f0e93a0 100644 --- a/test/common/int4.hpp +++ b/test/common/int4.hpp @@ -23,6 +23,9 @@ public: /// Assignment operator. UInt4& operator=(uint8_t value); + /// Assignment operator. + UInt4& operator=(int value); + /// Conversion operator. operator int32_t() const; @@ -72,6 +75,9 @@ public: /// Assignment operator. Int4& operator=(int8_t value); + /// Assignment operator. + Int4& operator=(int value); + /// Conversion operator. 
explicit operator int32_t() const; diff --git a/test/common/printer.cpp b/test/common/printer.cpp index 9bc7a100..320f7244 100644 --- a/test/common/printer.cpp +++ b/test/common/printer.cpp @@ -10,8 +10,10 @@ #include #include "src/kai_common.h" +#include "test/common/bfloat16.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/int4.hpp" namespace kai::test { @@ -36,6 +38,14 @@ inline void print_data(std::ostream& os, const uint8_t* data, size_t len, DataTy os << reinterpret_cast(data)[i]; break; + case DataType::FP16: + os << reinterpret_cast(data)[i]; + break; + + case DataType::BF16: + os << reinterpret_cast(data)[i]; + break; + case DataType::I32: os << reinterpret_cast(data)[i]; break; @@ -67,12 +77,16 @@ void print_matrix_raw(std::ostream& os, const uint8_t* data, DataType data_type, void print_matrix_per_row( std::ostream& os, const uint8_t* data, const DataFormat& format, size_t height, size_t width) { - const auto block_height = format.block_height(); + const auto has_scale = format.pack_format() == DataFormat::PackFormat::QUANTIZE_PER_ROW; + + const auto block_height = format.actual_block_height(height); + const auto num_blocks = (height + block_height - 1) / block_height; - const auto block_data_bytes = block_height * width * data_type_size_in_bits(format.data_type()) / 8; + KAI_ASSUME(format.default_size_in_bytes(height, width) % num_blocks == 0); + const auto block_data_bytes = format.default_size_in_bytes(height, width) / num_blocks; const auto block_offsets_bytes = block_height * data_type_size_in_bits(format.zero_point_data_type()) / 8; - const auto block_scales_bytes = block_height * data_type_size_in_bits(format.scale_data_type()) / 8; + const auto block_scales_bytes = has_scale ? 
block_height * data_type_size_in_bits(format.scale_data_type()) / 8 : 0; os << "[\n"; for (size_t y = 0; y < num_blocks; ++y) { @@ -80,8 +94,12 @@ void print_matrix_per_row( print_data(os, data, block_height, format.zero_point_data_type()); os << "], \"data\": ["; print_data(os, data + block_offsets_bytes, block_height * width, format.data_type()); - os << "], \"scales\": ["; - print_data(os, data + block_offsets_bytes + block_data_bytes, block_height, format.scale_data_type()); + + if (has_scale) { + os << "], \"scales\": ["; + print_data(os, data + block_offsets_bytes + block_data_bytes, block_height, format.scale_data_type()); + } + os << "]},\n"; data += block_offsets_bytes + block_data_bytes + block_scales_bytes; @@ -95,12 +113,13 @@ void print_matrix( std::ostream& os, std::string_view name, const void* data, const DataFormat& format, size_t height, size_t width) { os << name << " = "; - switch (format.quantization_format()) { - case DataFormat::QuantizationFormat::NONE: + switch (format.pack_format()) { + case DataFormat::PackFormat::NONE: print_matrix_raw(os, reinterpret_cast(data), format.data_type(), height, width); break; - case DataFormat::QuantizationFormat::PER_ROW: + case DataFormat::PackFormat::BIAS_PER_ROW: + case DataFormat::PackFormat::QUANTIZE_PER_ROW: print_matrix_per_row(os, reinterpret_cast(data), format, height, width); break; diff --git a/test/common/type_traits.hpp b/test/common/type_traits.hpp index 2267432a..00559fc0 100644 --- a/test/common/type_traits.hpp +++ b/test/common/type_traits.hpp @@ -13,6 +13,7 @@ namespace kai::test { class UInt4; class Int4; +class BFloat16; /// `true` if `T` is unsigned numeric type. template @@ -26,6 +27,10 @@ inline constexpr bool is_unsigned = true; template <> inline constexpr bool is_unsigned = true; +/// `true` if `T` is unsigned numeric type. +template <> +inline constexpr bool is_unsigned = false; + /// `true` if `T` is signed numeric type. 
template inline constexpr bool is_signed = std::is_signed_v; @@ -38,6 +43,10 @@ inline constexpr bool is_signed = false; template <> inline constexpr bool is_signed = false; +/// `true` if `T` is signed numeric type. +template <> +inline constexpr bool is_signed = true; + /// `true` if `T` is integral numeric type. template inline constexpr bool is_integral = std::is_integral_v; @@ -50,10 +59,22 @@ inline constexpr bool is_integral = true; template <> inline constexpr bool is_integral = true; +/// `true` if `T` is integral numeric type. +template <> +inline constexpr bool is_integral = false; + /// `true` if `T` is floating-point type. template inline constexpr bool is_floating_point = std::is_floating_point_v; +/// `true` if `T` is floating-point type. +template <> +inline constexpr bool is_floating_point = true; + +/// `true` if `T` is integral or floating-point type. +template +inline constexpr bool is_arithmetic = is_integral || is_floating_point; + /// Signed version of type `T`. template struct make_signed { diff --git a/test/reference/binary_elementwise.cpp b/test/reference/binary_elementwise.cpp index eda5d8ae..c5c61a66 100644 --- a/test/reference/binary_elementwise.cpp +++ b/test/reference/binary_elementwise.cpp @@ -13,6 +13,7 @@ #include "src/kai_common.h" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" @@ -105,6 +106,9 @@ std::vector binary_elementwise_any_type( case DataType::FP32: return binary_elementwise_any_op_type(lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); + case DataType::FP16: + return binary_elementwise_any_op_type(lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); + case DataType::I32: return binary_elementwise_any_op_type(lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); diff --git a/test/reference/cast.cpp b/test/reference/cast.cpp new file mode 100644 index 00000000..1eae56df --- /dev/null +++ b/test/reference/cast.cpp @@ 
-0,0 +1,44 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "kai_common.h" +#include "test/common/bfloat16.hpp" +#include "test/common/data_type.hpp" +#include "test/common/memory.hpp" + +namespace kai::test { + +namespace { + +template +std::vector cast_any_type(const void* src, size_t length) { + std::vector dst; + dst.resize(length * size_in_bits / 8); + + for (size_t i = 0; i < length; ++i) { + write_array(dst.data(), i, static_cast(read_array(src, i))); + } + + return dst; +} + +} // namespace + +std::vector cast(const void* src, kai::test::DataType src_dt, DataType dst_dt, size_t height, size_t width) { + const auto length = height * width; + + if (src_dt == DataType::BF16 && dst_dt == DataType::FP32) { + return cast_any_type(src, length); + } + + KAI_ERROR("Unsupported cast data type!"); +} + +} // namespace kai::test diff --git a/test/reference/cast.hpp b/test/reference/cast.hpp new file mode 100644 index 00000000..1ef3b9db --- /dev/null +++ b/test/reference/cast.hpp @@ -0,0 +1,28 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "test/common/data_type.hpp" + +namespace kai::test { + +/// Converts each element of the source matrix to the new data type. +/// +/// @param[in] src Source matrix data buffer. +/// @param[in] src_dt Data type of the source matrix. +/// @param[in] dst_dt Data type of the destination matrix. +/// @param[in] height Number of rows. +/// @param[in] width Number of columns. +/// +/// @return The result matrix containing data in the destination data type.
+std::vector cast(const void* src, DataType src_dt, DataType dst_dt, size_t height, size_t width); + +} // namespace kai::test diff --git a/test/reference/fill.cpp b/test/reference/fill.cpp index d0fd8b5c..238397c0 100644 --- a/test/reference/fill.cpp +++ b/test/reference/fill.cpp @@ -14,8 +14,10 @@ #include #include "src/kai_common.h" +#include "test/common/bfloat16.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" @@ -52,6 +54,22 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, uint64_ return fill_matrix_raw(height, width, [&](size_t, size_t) { return dist(rnd); }); } +template <> +std::vector fill_matrix_random_raw(size_t height, size_t width, uint64_t seed) { + std::mt19937 rnd(seed); + std::uniform_real_distribution dist; + + return fill_matrix_raw(height, width, [&](size_t, size_t) { return static_cast(dist(rnd)); }); +} + +template <> +std::vector fill_matrix_random_raw(size_t height, size_t width, uint64_t seed) { + std::mt19937 rnd(seed); + std::uniform_real_distribution dist; + + return fill_matrix_raw(height, width, [&](size_t, size_t) { return static_cast(dist(rnd)); }); +} + template <> std::vector fill_matrix_random_raw(size_t height, size_t width, uint64_t seed) { std::mt19937 rnd(seed); @@ -71,12 +89,18 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, } // namespace std::vector fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint64_t seed) { - switch (format.quantization_format()) { - case DataFormat::QuantizationFormat::NONE: + switch (format.pack_format()) { + case DataFormat::PackFormat::NONE: switch (format.data_type()) { case DataType::FP32: return fill_matrix_random_raw(height, width, seed); + case DataType::FP16: + return fill_matrix_random_raw(height, width, seed); + + case DataType::BF16: + return fill_matrix_random_raw(height, width, seed); + case 
DataType::QSU4: return fill_matrix_random_raw(height, width, seed); diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index 6b2ab041..d666d903 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -15,11 +15,12 @@ #include "test/common/data_type.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" -#include "test/common/printer.hpp" #include "test/reference/binary_elementwise.hpp" +#include "test/reference/cast.hpp" #include "test/reference/pack.hpp" #include "test/reference/quantize.hpp" #include "test/reference/reduce.hpp" +#include "test/reference/transpose.hpp" namespace kai::test { @@ -74,19 +75,24 @@ std::vector matmul_any_type( std::vector matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, - const DataFormat& dst_format, size_t height, size_t width) { + const DataFormat& dst_format, size_t n, size_t k, bool transposing) { const auto src_dt = src_format.data_type(); - const auto src_qf = src_format.quantization_format(); + const auto src_pf = src_format.pack_format(); const auto dst_dt = dst_format.data_type(); - const auto dst_qf = dst_format.quantization_format(); + const auto dst_pf = dst_format.pack_format(); std::vector tmp_data; std::vector tmp_scales; std::vector tmp_zero_points; - if (src_dt == DataType::QSU4 && src_qf == DataFormat::QuantizationFormat::NONE && // - dst_dt == DataType::QSI4 && dst_qf == DataFormat::QuantizationFormat::PER_ROW) { + if (transposing) { + tmp_data = transpose(data, src_dt, k, n); + data = tmp_data.data(); + } + + if (src_dt == DataType::QSU4 && src_pf == DataFormat::PackFormat::NONE && // + dst_dt == DataType::QSI4 && dst_pf == DataFormat::PackFormat::QUANTIZE_PER_ROW) { // For this specific RHS format conversion: // // * 4-bit data is added by 8. 
@@ -96,33 +102,33 @@ std::vector matmul_pack_rhs( KAI_ASSUME(zero_points == nullptr); const int32_t zero_point = 8; const uint8_t zero_point_i4 = UInt4::pack_u8(UInt4(zero_point), UInt4(zero_point)); - const int32_t row_zero_point = zero_point * static_cast(width); + const int32_t row_zero_point = zero_point * static_cast(k); KAI_ASSUME(dst_format.subblock_width() > 0); const auto subblock_width_i32 = static_cast(dst_format.subblock_width()); const auto subblock_width_f = static_cast(dst_format.subblock_width()); - tmp_zero_points = reduce_add(data, src_format, height, width, DataFormat(DataType::I32), 0); - tmp_zero_points = sub(tmp_zero_points.data(), DataType::I32, height, 1, &row_zero_point, DataType::I32, 1, 1); - tmp_zero_points = - mul(tmp_zero_points.data(), DataType::I32, height, 1, &subblock_width_i32, DataType::I32, 1, 1); + tmp_zero_points = reduce_add(data, src_format, n, k, DataFormat(DataType::I32), 0); + tmp_zero_points = sub(tmp_zero_points.data(), DataType::I32, n, 1, &row_zero_point, DataType::I32, 1, 1); + tmp_zero_points = mul(tmp_zero_points.data(), DataType::I32, n, 1, &subblock_width_i32, DataType::I32, 1, 1); zero_points = tmp_zero_points.data(); - tmp_data = add(data, DataType::QSU4, height, width, &zero_point_i4, DataType::QSU4, 1, 1); + tmp_data = add(data, DataType::QSU4, n, k, &zero_point_i4, DataType::QSU4, 1, 1); data = tmp_data.data(); - tmp_scales = div(scales, DataType::FP32, height, 1, &subblock_width_f, DataType::FP32, 1, 1); + tmp_scales = div(scales, DataType::FP32, n, 1, &subblock_width_f, DataType::FP32, 1, 1); scales = tmp_scales.data(); } - return pack(dst_format, data, scales, zero_points, src_format, height, width); + return pack(dst_format, data, scales, zero_points, src_format, n, k); } std::vector matmul( - const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // - const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // - DataType dst_dt, // - size_t 
m, size_t n, size_t k, // + const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // + const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // + const void* bias, const void* bias_scales, const void* bias_zero_points, DataType bias_dt, // + DataType dst_dt, // + size_t m, size_t n, size_t k, // bool lhs_transposed, bool rhs_transposed) { const auto lhs_h = lhs_transposed ? k : m; const auto lhs_w = lhs_transposed ? m : k; @@ -132,6 +138,8 @@ std::vector matmul( std::vector tmp_lhs; std::vector tmp_rhs; + std::vector tmp_dst; + std::vector tmp_bias; if (data_type_is_quantized(lhs_dt)) { tmp_lhs = dequantize( @@ -145,8 +153,41 @@ std::vector matmul( rhs = tmp_rhs.data(); } - KAI_ASSUME(dst_dt == DataType::FP32); - const auto tmp_dst = matmul_any_type(lhs, rhs, m, n, k, lhs_transposed, rhs_transposed); + if (lhs_dt != dst_dt) { + tmp_lhs = cast(lhs, lhs_dt, dst_dt, lhs_h, lhs_w); + lhs = tmp_lhs.data(); + } + + if (rhs_dt != dst_dt) { + tmp_rhs = cast(rhs, rhs_dt, dst_dt, rhs_h, rhs_w); + rhs = tmp_rhs.data(); + } + + switch (dst_dt) { + case DataType::FP32: + tmp_dst = matmul_any_type(lhs, rhs, m, n, k, lhs_transposed, rhs_transposed); + break; + + case DataType::FP16: + tmp_dst = matmul_any_type<_Float16>(lhs, rhs, m, n, k, lhs_transposed, rhs_transposed); + break; + + default: + KAI_ERROR("Unknown data type!"); + } + + if (bias != nullptr) { + if (bias_dt != dst_dt) { + tmp_bias = cast(bias, bias_dt, dst_dt, 1, n); + bias = tmp_bias.data(); + } + + KAI_ASSUME(!data_type_is_quantized(bias_dt)); + KAI_ASSUME(bias_scales == nullptr); + KAI_ASSUME(bias_zero_points == nullptr); + + tmp_dst = add(tmp_dst.data(), dst_dt, m, n, bias, bias_dt, 1, n); + } return tmp_dst; } diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index 7dfca92c..e71f17c2 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include 
"test/common/data_type.hpp" @@ -24,13 +23,14 @@ class DataFormat; /// @param[in] zero_points (Optional) Quantization zero points. /// @param[in] src_format Data format of the RHS matrix. /// @param[in] dst_format Data format of the packed RHS matrix. -/// @param[in] height Number of rows. -/// @param[in] width Number of columns. +/// @param[in] n Number of non-transposed columns. +/// @param[in] k Number of non-transposed rows. +/// @param[in] transposing Perform transpose then pack. /// /// @return The packed RHS matrix. std::vector matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, - const DataFormat& dst_format, size_t height, size_t width); + const DataFormat& dst_format, size_t n, size_t k, bool transposing); /// Matrix multiplication. /// @@ -38,10 +38,17 @@ std::vector matmul_pack_rhs( /// @param[in] lhs_scales (Optional) LHS operand quantization scales. /// @param[in] lhs_zero_points (Optional) LHS operand quantization zero point. /// @param[in] lhs_dt LHS operand data type. -/// @param[in] dst LHS operand data. -/// @param[in] dst_scales (Optional) LHS operand quantization scales. -/// @param[in] dst_zero_points (Optional) LHS operand quantization zero point. -/// @param[in] dst_dt LHS operand data type. +/// @param[in] rhs RHS operand data. +/// @param[in] rhs_scales (Optional) RHS operand quantization scales. +/// @param[in] rhs_zero_points (Optional) RHS operand quantization zero point. +/// @param[in] rhs_dt RHS operand data type. +/// @param[in] bias Bias operand data. +/// @param[in] bias_scales (Optional) Bias operand quantization scales. +/// @param[in] bias_zero_points (Optional) Bias operand quantization zero point. +/// @param[in] bias_dt Bias operand data type. +/// @param[in] dst Output data. +/// @param[in] dst_scales (Optional) Output quantization scales. +/// @param[in] dst_zero_points (Optional) Output quantization zero point. /// @param[in] dst_dt Output data type. 
/// @param[in] m Output height. /// @param[in] n Output width. @@ -51,10 +58,11 @@ std::vector matmul_pack_rhs( /// /// @return The result data buffer. std::vector matmul( - const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // - const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // - DataType dst_dt, // - size_t m, size_t n, size_t k, // + const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // + const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // + const void* bias, const void* bias_scales, const void* bias_zero_points, DataType bias_dt, // + DataType dst_dt, // + size_t m, size_t n, size_t k, // bool lhs_transposed, bool rhs_transposed); } // namespace kai::test diff --git a/test/reference/pack.cpp b/test/reference/pack.cpp index 8a596ed2..a61efff8 100644 --- a/test/reference/pack.cpp +++ b/test/reference/pack.cpp @@ -6,9 +6,11 @@ #include "test/reference/pack.hpp" +#include #include #include #include +#include #include #include @@ -16,11 +18,62 @@ #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/reference/quantize.hpp" +#include "test/reference/round.hpp" namespace kai::test { namespace { +/// Packs the matrix from raw to per-row bias format. 
+std::vector pack_bias_per_row( + size_t data_esize, size_t zero_point_esize, const void* src, const void* bias, size_t height, size_t width, + size_t block_height, size_t block_width, size_t subblock_height, size_t subblock_width) { + const auto num_groups = (height + block_height - 1) / block_height; + const auto group_num_blocks = (width + block_width - 1) / block_width; + + const auto group_zero_points_bytes = block_height * zero_point_esize; + const auto block_data_bytes = block_height * block_width * data_esize; + const auto group_bytes = group_zero_points_bytes + group_num_blocks * block_data_bytes; + const auto dst_bytes = num_groups * group_bytes; + + std::vector dst; + dst.resize(dst_bytes); + + const auto* src_ptr = reinterpret_cast(src); + const auto* bias_ptr = reinterpret_cast(bias); + auto* dst_ptr = dst.data(); + + for (size_t y_block = 0; y_block < height; y_block += block_height) { + // Packs the zero points. + const auto bias_len = std::min(block_height, height - y_block); + memcpy(dst_ptr, bias_ptr, bias_len * zero_point_esize); + bias_ptr += block_height * zero_point_esize; + dst_ptr += block_height * zero_point_esize; + + for (size_t x_block = 0; x_block < width; x_block += block_width) { + for (size_t y_subblock = 0; y_subblock < block_height; y_subblock += subblock_height) { + for (size_t x_subblock = 0; x_subblock < block_width; x_subblock += subblock_width) { + for (size_t y_element = 0; y_element < subblock_height; ++y_element) { + if (y_block + y_subblock + y_element < height) { + const auto len = std::min(subblock_width, width - x_block - x_subblock); + memcpy( + dst_ptr, + src_ptr + + ((y_block + y_subblock + y_element) * width + x_block + x_subblock) * data_esize, + len * data_esize); + } + dst_ptr += subblock_width * data_esize; + } + } + } + } + } + + KAI_ASSERT(reinterpret_cast(dst_ptr) - reinterpret_cast(dst.data()) == dst_bytes); + + return dst; +} + /// Packs the matrix from raw to quantized format. 
template std::vector pack_quant_per_row( @@ -182,21 +235,37 @@ std::vector pack( const DataFormat& dst_format, const void* src, const void* scales, const void* zero_points, const DataFormat& src_format, size_t height, size_t width) { const auto dst_dt = dst_format.data_type(); - const auto dst_qf = dst_format.quantization_format(); + const auto dst_qf = dst_format.pack_format(); const auto src_dt = src_format.data_type(); - const auto src_qf = src_format.quantization_format(); + const auto src_qf = src_format.pack_format(); + + const auto block_height = dst_format.actual_block_height(height); + const auto block_width = dst_format.actual_block_width(width); + const auto subblock_height = dst_format.actual_subblock_height(height); + const auto subblock_width = dst_format.actual_subblock_width(width); - if (src_qf == DataFormat::QuantizationFormat::NONE && dst_qf == DataFormat::QuantizationFormat::PER_ROW) { + if (src_qf == DataFormat::PackFormat::NONE && dst_qf == DataFormat::PackFormat::QUANTIZE_PER_ROW) { if (dst_dt == DataType::QAI8 && src_dt == DataType::FP32 && dst_format.scale_data_type() == DataType::FP32 && dst_format.zero_point_data_type() == DataType::I32) { - return pack_quant_per_row( - src, height, width, dst_format.block_height(), dst_format.block_width()); + return pack_quant_per_row(src, height, width, block_height, block_width); } else if ( dst_dt == DataType::QSI4 && src_dt == DataType::QSU4 && dst_format.scale_data_type() == DataType::FP32 && dst_format.zero_point_data_type() == DataType::I32) { return pack_per_row_qs4( - src, scales, zero_points, height, width, dst_format.block_height(), dst_format.block_width(), - dst_format.subblock_height(), dst_format.subblock_width()); + src, scales, zero_points, height, width, block_height, block_width, subblock_height, subblock_width); + } + } + + if (src_qf == DataFormat::PackFormat::NONE && dst_qf == DataFormat::PackFormat::BIAS_PER_ROW) { + KAI_ASSUME(src_dt == dst_dt); + + const auto data_esize = 
data_type_size_in_bits(dst_dt); + const auto zero_point_esize = data_type_size_in_bits(dst_format.zero_point_data_type()); + + if (data_esize % 8 == 0 && zero_point_esize % 8 == 0) { + return pack_bias_per_row( + data_esize / 8, zero_point_esize / 8, src, zero_points, height, width, block_height, block_width, + subblock_height, subblock_width); } } diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp new file mode 100644 index 00000000..74c87c04 --- /dev/null +++ b/test/reference/transpose.cpp @@ -0,0 +1,38 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test/reference/transpose.hpp" + +#include +#include +#include + +#include "kai_common.h" +#include "test/common/data_type.hpp" + +namespace kai::test { + +std::vector transpose(const void* data, DataType data_type, size_t height, size_t width) { + KAI_ASSUME(data_type_size_in_bits(data_type) % 8 == 0); + const auto element_size = data_type_size_in_bits(data_type) / 8; + + std::vector output; + output.resize(height * width * element_size); + + const auto* src_ptr = reinterpret_cast(data); + + for (size_t y = 0; y < width; ++y) { + for (size_t x = 0; x < height; ++x) { + memcpy( + output.data() + (y * height + x) * element_size, src_ptr + (x * width + y) * element_size, + element_size); + } + } + + return output; +} + +} // namespace kai::test diff --git a/test/reference/transpose.hpp b/test/reference/transpose.hpp new file mode 100644 index 00000000..2c2f6a83 --- /dev/null +++ b/test/reference/transpose.hpp @@ -0,0 +1,27 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "test/common/data_type.hpp" + +namespace kai::test { + +/// Transposes the matrix. +/// +/// @param[in] data Data buffer. +/// @param[in] data_type Element data type. 
+/// @param[in] height Number of rows. +/// @param[in] width Number of columns. +/// +/// @return The transposed matrix. +std::vector transpose(const void* data, DataType data_type, size_t height, size_t width); + +} // namespace kai::test diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index dcafa8e0..f20ead47 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -15,14 +15,20 @@ #include #include #include +#include #include #include #include #include "src/kai_common.h" +#include "src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h" +#include "src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h" +#include "src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h" +#include "src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h" #include "test/common/compare.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/matrix_portion.hpp" #include "test/common/printer.hpp" #include "test/reference/fill.hpp" @@ -34,6 +40,8 @@ namespace kai::test { /// Matrix multiplication method. struct MatMulMethod { + std::string_view name; ///< Name of matmul method. + size_t m0; ///< Block size in M dimension. size_t n0; ///< Block size in N dimension. size_t k0; ///< Block size in K dimension. @@ -45,15 +53,17 @@ struct MatMulMethod { DataFormat lhs_format; ///< Data format of the LHS matrix. DataFormat packed_lhs_format; ///< Data format of the packed LHS matrix. DataFormat rhs_format; ///< Data format of the RHS matrix. - DataFormat packed_rhs_format; ///< Data for mat of the packed RHS matrix. + DataFormat packed_rhs_format; ///< Data format of the packed RHS matrix. + DataFormat bias_format; ///< Data format of the bias vector. 
/// Gets the offset in bytes of the LHS matrix. /// /// @param[in] m_idx Coordinate of the matrix in M dimension. + /// @param[in] k_idx Coordinate of the matrix in K dimension. /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes. - std::function fn_get_lhs_offset; + std::function fn_get_lhs_offset; /// Gets the size in bytes of the packed LHS matrix. /// @@ -80,54 +90,61 @@ struct MatMulMethod { /// @param[out] packed_lhs Packed LHS matrix data buffer. std::function fn_pack_lhs; + [[nodiscard]] bool is_pack_lhs_needed() const { + return fn_pack_lhs != nullptr; + } + /// Gets the offset in bytes of the RHS matrix. /// /// @param[in] n_idx Coordinate of the matrix in N dimension. + /// @param[in] k_idx Coordinate of the matrix in K dimension. /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes. - std::function fn_get_rhs_offset; + std::function fn_get_rhs_offset; /// Gets the size in bytes of the packed RHS matrix. /// /// @param[in] n Size of the matrix in N dimension. /// @param[in] k Size of the matrix in K dimension. - /// @param[in] block_height Block height. - /// @param[in] block_width Block width. /// /// @return The size in bytes. - std::function fn_get_packed_rhs_size; + std::function fn_get_packed_rhs_size; /// Gets the offset in bytes of the packed RHS matrix. /// - /// @param[in] n_idx Coordinate of the matrix in N dimension. /// @param[in] k Size of the matrix in K dimension. - /// @param[in] block_height Block height. - /// @param[in] block_width Block width. + /// @param[in] n_idx Coordinate of the matrix in N dimension. + /// @param[in] k_idx Coordinate of the matrix in K dimension. /// /// @return The offset in bytes. - std::function fn_get_packed_rhs_offset; + std::function fn_get_packed_rhs_offset; + + std::function + fn_pack_rhs; /// Performs matrix multiplication. /// /// @param[in] m Size of the matrix in M dimension. /// @param[in] n Size of the matrix in N dimension. 
/// @param[in] k Size of the matrix in K dimension. - /// @param[in] lhs_p Packed LHS data buffer. - /// @param[in] rhs_p Packed RHS data buffer. + /// @param[in] lhs LHS data buffer. + /// @param[in] packed_rhs Packed RHS data buffer. /// @param[out] dst Output data buffer. - /// @param[in] dst_stride_row Output row stride. - /// @param[in] dst_stride_col Output column stride. - /// @param[in] scalar_min Lower bound of the output data. - /// @param[in] scalar_max Upper bound of the output data. + /// @param[in] lhs_stride LHS row stride. + /// @param[in] dst_stride Output row stride. + /// @param[in] clamp_min Lower bound of the output data. + /// @param[in] clamp_max Upper bound of the output data. std::function - fn_main; + size_t m, size_t n, size_t k, // + const void* lhs, const void* packed_rhs, void* dst, // + size_t lhs_stride, size_t dst_stride, // + Float16 clamp_min, Float16 clamp_max)> + fn_main_hybrid_fp16; /// Gets a value indicating whether pre-processing the RHS matrix is needed. [[nodiscard]] bool is_pack_rhs_needed() const { - return false; + return fn_pack_rhs != nullptr; } /// Preprocesses the RHS matrix. 
@@ -150,7 +167,28 @@ struct MatMulMethod { KAI_UNUSED(scale); KAI_UNUSED(packed_rhs); - KAI_ERROR("RHS pre-processing is not supported!"); + if (fn_pack_rhs != nullptr) { + fn_pack_rhs(n, k, rhs, bias, packed_rhs, rhs_row_stride); + } else { + KAI_ERROR("RHS pre-processing is not supported!"); + } + } + + [[nodiscard]] bool has_main_kernel() const { + return fn_main_hybrid_fp16 != nullptr; + } + + void main_kernel( + size_t m, size_t n, size_t k, const void* lhs, const void* rhs, const void* bias, void* dst, size_t lhs_stride, + size_t rhs_stride, size_t dst_stride, float clamp_min, float clamp_max) const { + KAI_UNUSED(bias); + KAI_UNUSED(rhs_stride); + + if (fn_main_hybrid_fp16) { + fn_main_hybrid_fp16(m, n, k, lhs, rhs, dst, lhs_stride, dst_stride, clamp_min, clamp_max); + } else { + KAI_ERROR("Main kernel is not available!"); + } } }; @@ -159,31 +197,66 @@ struct MatMulMethod { /// List of supported matrix multiplication methods. static const std::array matmul_methods = { MatMulMethod{ - .m0 = 4, - .n0 = 4, - .k0 = 32, + .name = "matmul_nt_nt_fp16_fp16_fp16_6x16_neon_mla", + + .m0 = 6, + .n0 = 16, + .k0 = 0, // Not applicable. 
.lhs_transposed = false, - .rhs_transposed = true, + .rhs_transposed = false, + + .dst_format = DataFormat(DataType::FP16), + .lhs_format = DataFormat(DataType::FP16), + .packed_lhs_format = DataFormat(DataType::UNKNOWN), + .rhs_format = DataFormat(DataType::FP16), + .packed_rhs_format = DataFormat( + DataType::FP16, 16, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP16, DataType::UNKNOWN, 16, 1), + .bias_format = DataFormat(DataType::FP16), + + .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla, + .fn_get_packed_lhs_size = nullptr, + .fn_get_packed_lhs_offset = nullptr, + .fn_pack_lhs = nullptr, + + .fn_get_rhs_offset = kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, + .fn_get_packed_rhs_size = kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, + .fn_get_packed_rhs_offset = kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, + .fn_pack_rhs = kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, + + .fn_main_hybrid_fp16 = kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla, + }, + + MatMulMethod{ + .name = "matmul_nt_nt_f32_bf16_bf16_6x16_neon_mmla", + + .m0 = 6, + .n0 = 16, + .k0 = 0, // Not applicable. 
+ + .lhs_transposed = false, + .rhs_transposed = false, .dst_format = DataFormat(DataType::FP32), - .lhs_format = DataFormat(DataType::FP32), - .packed_lhs_format = - DataFormat(DataType::QAI8, 4, 8, DataFormat::QuantizationFormat::PER_ROW, DataType::FP32, DataType::I32), - .rhs_format = DataFormat(DataType::QSU4), + .lhs_format = DataFormat(DataType::BF16), + .packed_lhs_format = DataFormat(DataType::UNKNOWN), + .rhs_format = DataFormat(DataType::BF16), .packed_rhs_format = DataFormat( - DataType::QSI4, 4, 32, DataFormat::QuantizationFormat::PER_ROW, DataType::FP32, DataType::I32, 1, 16), + DataType::BF16, 16, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP32, DataType::UNKNOWN, 16, 4), + .bias_format = DataFormat(DataType::FP32), - .fn_get_lhs_offset = nullptr, + .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla, .fn_get_packed_lhs_size = nullptr, .fn_get_packed_lhs_offset = nullptr, .fn_pack_lhs = nullptr, - .fn_get_rhs_offset = nullptr, - .fn_get_packed_rhs_size = nullptr, - .fn_get_packed_rhs_offset = nullptr, + .fn_get_rhs_offset = kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, + .fn_get_packed_rhs_size = kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, + .fn_get_packed_rhs_offset = + kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, + .fn_pack_rhs = kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, - .fn_main = nullptr, + .fn_main_hybrid_fp16 = kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla, }, }; @@ -195,15 +268,16 @@ struct MatMulShape { }; /// Matrix multiplication test information. -using MatMulTestParams = std::tuple; +using MatMulTestParams = std::tuple; /// Prints the test information. 
void PrintTo(const MatMulTestParams& param, std::ostream* os) { - const auto& [shape, method_no, portion] = param; + const auto& [method_no, shape, portion] = param; - *os << "m: " << shape.m << ", n: " << shape.n << ", k: " << shape.k << ", method_no: " << method_no - << ", portion: { start_row: " << portion.start_row() << ", start_col: " << portion.start_col() - << ", height: " << portion.height() << ", width: " << portion.width() << "}"; + *os << "method: " << matmul_methods[method_no].name << ", m: " << shape.m << ", n: " << shape.n + << ", k: " << shape.k << ", portion: { start_row: " << portion.start_row() + << ", start_col: " << portion.start_col() << ", height: " << portion.height() << ", width: " << portion.width() + << "}"; } /// Matrix multiplication test fixture. @@ -219,13 +293,14 @@ protected: std::vector ref_packed_lhs{}; ///< Reference packed LHS. std::vector rhs{}; ///< RHS operand. std::vector rhs_scales{}; ///< RHS per-row quantization scales. + std::vector bias{}; ///< Bias. std::vector ref_packed_rhs{}; ///< Reference packed RHS. std::vector ref_dst{}; ///< Reference output. }; /// Gets the test data for the current test case. static const TestData& test_data() { - const auto& [info, method_no, portion] = GetParam(); + const auto& [method_no, info, portion] = GetParam(); const TestDataId data_id{info.m, info.n, info.k, method_no}; // If the test data is already available, returns it. @@ -238,11 +313,18 @@ protected: // Generates the test data. const auto& method = matmul_methods.at(method_no); + const auto has_lhs_pack = method.packed_lhs_format.data_type() != DataType::UNKNOWN; + const auto has_rhs_pack = method.packed_rhs_format.data_type() != DataType::UNKNOWN; + const auto has_bias = method.bias_format.data_type() != DataType::UNKNOWN; + const auto lhs_h = method.lhs_transposed ? info.k : info.m; const auto lhs_w = method.lhs_transposed ? 
info.m : info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - auto ref_packed_lhs = + std::vector ref_packed_lhs; + + if (has_lhs_pack) { pack(method.packed_lhs_format, lhs.data(), nullptr, nullptr, method.lhs_format, lhs_h, lhs_w); + } const auto rhs_h = method.rhs_transposed ? info.n : info.k; const auto rhs_w = method.rhs_transposed ? info.k : info.n; @@ -250,13 +332,24 @@ protected: std::vector rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && - method.rhs_format.quantization_format() == DataFormat::QuantizationFormat::NONE) { + method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); } - auto packed_rhs = matmul_pack_rhs( - rhs.data(), !rhs_scales.empty() ? rhs_scales.data() : nullptr, nullptr, method.rhs_format, - method.packed_rhs_format, rhs_h, rhs_w); + const auto bias_h = 1; + const auto bias_w = info.n; + std::vector bias; + + if (has_bias) { + bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); + } + + std::vector packed_rhs; + if (has_rhs_pack) { + packed_rhs = matmul_pack_rhs( + rhs.data(), !rhs_scales.empty() ? 
rhs_scales.data() : nullptr, bias.data(), method.rhs_format, + method.packed_rhs_format, info.n, info.k, !method.rhs_transposed); + } KAI_ASSUME(method.lhs_format.is_raw()); KAI_ASSUME(method.rhs_format.is_raw()); @@ -264,6 +357,7 @@ protected: auto ref_dst = matmul( lhs.data(), nullptr, nullptr, method.lhs_format.data_type(), // rhs.data(), rhs_scales.data(), nullptr, method.rhs_format.data_type(), // + bias.data(), nullptr, nullptr, method.bias_format.data_type(), // method.dst_format.data_type(), // info.m, info.n, info.k, method.lhs_transposed, method.rhs_transposed); @@ -272,6 +366,7 @@ protected: .ref_packed_lhs = std::move(ref_packed_lhs), .rhs = std::move(rhs), .rhs_scales = std::move(rhs_scales), + .bias = std::move(bias), .ref_packed_rhs = std::move(packed_rhs), .ref_dst = std::move(ref_dst), }; @@ -287,11 +382,11 @@ std::map MatMulTest::_data; /// Tests the LHS packing kernel. TEST_P(MatMulTest, PackedLhs) { - const auto& [info, method_no, portion] = GetParam(); + const auto& [method_no, info, portion] = GetParam(); const auto& data = test_data(); const auto& method = matmul_methods.at(method_no); - if (method.fn_pack_lhs == nullptr) { + if (!method.is_pack_lhs_needed()) { GTEST_SKIP(); } @@ -312,7 +407,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_size = method.packed_lhs_format.default_size_in_bytes(lhs_h, lhs_w); ASSERT_EQ(packed_lhs_size, ref_packed_lhs_size); - const auto lhs_offset = method.fn_get_lhs_offset(rect.start_row(), ref_lhs_row_stride); + const auto lhs_offset = method.fn_get_lhs_offset(rect.start_row(), 0, ref_lhs_row_stride); const auto ref_lhs_offset = method.lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(lhs_offset, ref_lhs_offset); @@ -334,7 +429,7 @@ TEST_P(MatMulTest, PackedLhs) { /// Tests the RHS packing kernel. 
TEST_P(MatMulTest, PackedRhs) { - const auto& [info, method_no, portion] = GetParam(); + const auto& [method_no, info, portion] = GetParam(); const auto& data = test_data(); const auto& method = matmul_methods.at(method_no); @@ -342,58 +437,65 @@ TEST_P(MatMulTest, PackedRhs) { GTEST_SKIP(); } - const auto rhs_h = method.rhs_transposed ? info.n : info.k; const auto rhs_w = method.rhs_transposed ? info.k : info.n; + const auto packed_rhs_h = info.n; + const auto packed_rhs_w = info.k; const auto rect = portion.compute_portion( - rhs_h, rhs_w, method.packed_rhs_format.scheduler_block_height(rhs_h), - method.packed_rhs_format.scheduler_block_width(rhs_w)); + packed_rhs_h, packed_rhs_w, method.packed_rhs_format.scheduler_block_height(packed_rhs_h), + method.packed_rhs_format.scheduler_block_width(packed_rhs_w)); if (rect.height() == 0 || rect.width() == 0) { GTEST_SKIP(); } + const auto rhs_start_row = method.rhs_transposed ? rect.start_row() : rect.start_col(); + const auto rhs_start_col = method.rhs_transposed ? 
rect.start_col() : rect.start_row(); + const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); - const auto rhs_offset = method.fn_get_rhs_offset(rect.start_row(), ref_rhs_row_stride); - const auto ref_rhs_offset = method.rhs_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), rhs_w); + const auto rhs_offset = method.fn_get_rhs_offset(rect.start_row(), rect.start_col(), ref_rhs_row_stride); + const auto ref_rhs_offset = method.rhs_format.default_offset_in_bytes(rhs_start_row, rhs_start_col, rhs_w); ASSERT_EQ(rhs_offset, ref_rhs_offset); - const auto packed_rhs_size = method.fn_get_packed_rhs_size( - rhs_h, rhs_w, method.packed_rhs_format.block_height(), method.packed_rhs_format.block_width()); - const auto ref_packed_rhs_size = method.packed_rhs_format.default_size_in_bytes(rhs_h, rhs_w); + const auto packed_rhs_size = method.fn_get_packed_rhs_size(packed_rhs_h, packed_rhs_w); + const auto ref_packed_rhs_size = method.packed_rhs_format.default_size_in_bytes(packed_rhs_h, packed_rhs_w); ASSERT_EQ(packed_rhs_size, ref_packed_rhs_size); - const auto packed_rhs_offset = method.fn_get_packed_rhs_offset( - rect.start_row(), rhs_w, method.packed_rhs_format.block_height(), method.packed_rhs_format.block_width()); + const auto packed_rhs_offset = method.fn_get_packed_rhs_offset(info.k, rect.start_row(), rect.start_col()); const auto ref_packed_rhs_offset = - method.packed_rhs_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), rhs_w); + method.packed_rhs_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), packed_rhs_w); ASSERT_EQ(packed_rhs_offset, ref_packed_rhs_offset); const auto ref_rhs_scales_offset = rect.start_row() * data_type_size_in_bits(method.packed_rhs_format.scale_data_type()) / 8; + const auto ref_bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), info.n); + std::vector packed_rhs; packed_rhs.resize(packed_rhs_size); method.pack_rhs( - rect.height(), rect.width(), 
data.rhs.data() + rhs_offset, ref_rhs_row_stride, nullptr, + rect.height(), rect.width(), data.rhs.data() + rhs_offset, ref_rhs_row_stride, + data.bias.data() + ref_bias_offset, !data.rhs_scales.empty() ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, packed_rhs.data() + packed_rhs_offset); - DefaultMismatchHandler handler(0, 0.0001, 0, 0.001); - const auto success = - compare(packed_rhs.data(), data.ref_packed_rhs.data(), method.packed_rhs_format, rhs_h, rhs_w, rect, handler); + const auto exact = method.packed_rhs_format.pack_format() != DataFormat::PackFormat::QUANTIZE_PER_ROW; + DefaultMismatchHandler handler(0, exact ? 0 : 0.0001, 0, exact ? 0 : 0.001); + const auto success = compare( + packed_rhs.data(), data.ref_packed_rhs.data(), method.packed_rhs_format, packed_rhs_h, packed_rhs_w, rect, + handler); ASSERT_TRUE(success); } /// Tests the output. TEST_P(MatMulTest, Output) { - const auto& [info, method_no, portion] = GetParam(); + const auto& [method_no, info, portion] = GetParam(); const auto& data = test_data(); const auto& method = matmul_methods.at(method_no); - if (method.fn_main == nullptr) { + if (!method.has_main_kernel()) { GTEST_SKIP(); } @@ -403,25 +505,54 @@ TEST_P(MatMulTest, Output) { GTEST_SKIP(); } - const auto ref_dst_row_stride = method.dst_format.default_row_stride(info.n); - const auto ref_dst_col_stride = data_type_size_in_bits(method.dst_format.data_type()) / 8; + const auto lhs_w = method.lhs_transposed ? info.m : info.k; + const auto rhs_w = method.rhs_transposed ? info.k : info.n; + const auto bias_w = info.n; + const auto dst_w = info.n; + + const auto* lhs_data = data.lhs.data(); + const auto lhs_start_row = method.lhs_transposed ? 0 : rect.start_row(); + const auto lhs_start_col = method.lhs_transposed ? 
rect.start_row() : 0; + auto lhs_offset = method.lhs_format.default_offset_in_bytes(lhs_start_row, lhs_start_col, lhs_w); + const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); + + if (method.is_pack_lhs_needed()) { + lhs_data = data.ref_packed_lhs.data(); + lhs_offset = method.packed_lhs_format.default_offset_in_bytes(lhs_start_row, lhs_start_col, info.k); + } + + const auto rhs_stride = method.rhs_format.default_row_stride(rhs_w); + + const uint8_t* rhs_data = nullptr; + uintptr_t rhs_offset = 0; - const auto ref_packed_lhs_offset = method.packed_lhs_format.default_offset_in_bytes( - method.lhs_transposed ? 0 : rect.start_row(), method.lhs_transposed ? rect.start_row() : 0, - method.lhs_transposed ? info.m : info.k); - const auto ref_packed_rhs_offset = method.packed_rhs_format.default_offset_in_bytes( - method.rhs_transposed ? rect.start_col() : 0, method.rhs_transposed ? 0 : rect.start_col(), - method.rhs_transposed ? info.k : info.n); - const auto ref_dst_offset = method.dst_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), info.n); + if (method.is_pack_rhs_needed()) { + const auto packed_rhs_start_row = rect.start_col(); + const auto packed_rhs_start_col = 0; + rhs_data = data.ref_packed_rhs.data(); + rhs_offset = + method.packed_rhs_format.default_offset_in_bytes(packed_rhs_start_row, packed_rhs_start_col, info.k); + } else { + const auto rhs_start_row = method.rhs_transposed ? rect.start_col() : 0; + const auto rhs_start_col = method.rhs_transposed ? 
0 : rect.start_col(); + + rhs_data = data.rhs.data(); + rhs_offset = method.rhs_format.default_offset_in_bytes(rhs_start_row, rhs_start_col, rhs_w); + } + + const auto* bias_data = data.bias.data(); + const auto bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), bias_w); + + const auto dst_offset = method.dst_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), dst_w); + const auto dst_stride = method.dst_format.default_row_stride(dst_w); std::vector dst; dst.resize(method.dst_format.default_size_in_bytes(info.m, info.n)); - method.fn_main( - rect.height(), rect.width(), info.k, data.ref_packed_lhs.data() + ref_packed_lhs_offset, - data.ref_packed_rhs.data() + ref_packed_rhs_offset, reinterpret_cast(dst.data() + ref_dst_offset), - ref_dst_row_stride, ref_dst_col_stride, std::numeric_limits::lowest(), - std::numeric_limits::max()); + method.main_kernel( + rect.height(), rect.width(), info.k, lhs_data + lhs_offset, rhs_data + rhs_offset, bias_data + bias_offset, + dst.data() + dst_offset, lhs_stride, rhs_stride, dst_stride, -std::numeric_limits::infinity(), + std::numeric_limits::infinity()); DefaultMismatchHandler handler(0, 0.1, 0, 0.05); const auto success = compare(dst.data(), data.ref_dst.data(), method.dst_format, info.m, info.n, rect, handler); @@ -431,10 +562,12 @@ TEST_P(MatMulTest, Output) { INSTANTIATE_TEST_SUITE_P( MatMul, MatMulTest, testing::Combine( - testing::Values( - MatMulShape{4, 4, 32}, // - MatMulShape{12, 16, 64}), testing::Range(0, matmul_methods.size()), + testing::Values( + MatMulShape{6, 16, 32}, // + MatMulShape{12, 32, 17}, // + MatMulShape{13, 33, 23} // + ), testing::Values( MatrixPortion(0, 0, 1, 1), // Full matrix. MatrixPortion(0, 0, 0.25, 0.25), // Top-left corner. -- GitLab From 6253af657896fa0944f5071d4dd05398f0cf2bb8 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 28 May 2024 16:19:07 +0100 Subject: [PATCH 2/3] Fix CI issues and remove BF16 kernels * Remove BF16 kernels from the PR. 
* Fix compilation issues in CI. Signed-off-by: Viet-Hoa Do --- CMakeLists.txt | 13 +- ...mp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c | 3704 ----------------- ...mp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h | 111 - ...ack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c | 498 --- ...ack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h | 90 - test/common/bfloat16.hpp | 30 +- test/common/compare.cpp | 6 +- test/common/float16.hpp | 2 +- test/reference/matmul.cpp | 3 +- test/reference/quantize.cpp | 2 +- test/reference/transpose.cpp | 1 + test/tests/matmul_test.cpp | 44 +- 12 files changed, 43 insertions(+), 4461 deletions(-) delete mode 100644 src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c delete mode 100644 src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h delete mode 100644 src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c delete mode 100644 src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index fd7d79c5..7d4a7744 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,11 +56,13 @@ set(KLEIDIAI_WARNING_FLAGS $<$:${KLEIDIAI_WARNING_FLAGS_CXX}> ) -add_library(kleidiai +set(KLEIDIAI_FILES_NEON_FP16 src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c - src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c - src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c +) + +add_library(kleidiai + ${KLEIDIAI_FILES_NEON_FP16} ) target_include_directories(kleidiai @@ -72,6 +74,10 @@ target_compile_options(kleidiai PRIVATE ${KLEIDIAI_WARNING_FLAGS} ) 
+foreach(KLEIDIAI_SOURCE_FILE IN LISTS KLEIDIAI_FILES_NEON_FP16) + set_property(SOURCE ${KLEIDIAI_SOURCE_FILE} PROPERTY COMPILE_OPTIONS -march=armv8-a+fp16) +endforeach() + if(KLEIDIAI_BUILD_TESTS) enable_testing() include(GoogleTest) @@ -106,6 +112,7 @@ if(KLEIDIAI_BUILD_TESTS) target_compile_options(kleidiai_test PRIVATE ${KLEIDIAI_WARNING_FLAGS} + PRIVATE -march=armv8-a+fp16+bf16 ) target_link_libraries(kleidiai_test diff --git a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c deleted file mode 100644 index ea63cffa..00000000 --- a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.c +++ /dev/null @@ -1,3704 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include - -typedef bfloat16_t bfloat16; - -#include "kai_common.h" - -static const size_t block_height = 6; -static const size_t block_width = 16; - -size_t kai_get_m_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m) { - KAI_UNUSED(m); - - return 6; -} - -size_t kai_get_n_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n) { - KAI_UNUSED(n); - - return 16; -} - -size_t kai_get_lhs_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t k) { - return k * sizeof(bfloat16); -} - -size_t kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t m_idx, size_t k_idx, size_t stride) { - KAI_ASSUME(m_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); - - return m_idx * stride; -} - -size_t kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t k, size_t n_idx, size_t k_idx) { - KAI_ASSUME(n_idx % block_width == 0); - KAI_ASSUME(k_idx == 0); - - return n_idx / block_width * (block_width * sizeof(float) + 
block_width * k * sizeof(bfloat16)); -} - -size_t kai_get_dst_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n) { - return n * sizeof(float); -} - -size_t kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t m_idx, size_t n_idx, size_t stride) { - KAI_ASSUME(m_idx % block_height == 0); - KAI_ASSUME(n_idx % block_width == 0); - - return m_idx * stride + n_idx * sizeof(float); -} - -size_t kai_get_dst_size_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m, size_t n, size_t stride) { - return m * stride + n * sizeof(float); -} - -void kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t m, size_t n, size_t k, // - const void* lhs, const void* packed_rhs, void* dst, // - size_t lhs_stride, size_t dst_stride, // - float clamp_min, float clamp_max) { - typedef struct { - float maxval; - float minval; - unsigned int num_strings; - const unsigned int* string_lengths; - size_t N; - const void* B_ptr; - size_t output_offset; - size_t input_initial_col; - size_t input_offset; - void* output_ptr; - const void* bias; - } KernelArgs; - - KernelArgs ka; - - unsigned long flags = 0; - - unsigned int string_length = k; - ka.num_strings = 1; - ka.string_lengths = &string_length; - ka.N = n; - ka.B_ptr = packed_rhs; - ka.bias = NULL; - - // Direct input. - const void* input_ptr = lhs; - ka.input_offset = lhs_stride / sizeof(bfloat16); - ka.input_initial_col = 0; - - // Direct output. - ka.output_ptr = dst; - ka.output_offset = dst_stride / sizeof(float); - - // Clamping output. 
- flags |= 0x2; - ka.maxval = clamp_max; - ka.minval = clamp_min; - - __asm__ __volatile__( - "1:" // Row loop - "cmp %x[m], #0x6\n" - "bge 186f\n" - "cmp %x[m], #0x4\n" - "bgt 149f\n" - "beq 112f\n" - "cmp %x[m], #0x2\n" - "bgt 75f\n" - "beq 38f\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "2:" // Height 1: Column loop - "cbz x10, 3f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "b 15f\n" - "3:" // Height 1: no bias - "tbz %x[flags], #0, 14f\n" - "cmp x11, #0x10\n" - "bge 12f\n" - "tbz x11, #3, 7f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "tbz x11, #2, 5f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "tbz x11, #1, 4f\n" - "ldr d16, [x9], #0x8\n" - "mov x20, #0x38\n" - "tbz x11, #0, 11f\n" - "ld1 { v16.s }[2], [x9]\n" - "b 11f\n" - "4:" // Height 1: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 11f\n" - "ldr s16, [x9, #0x0]\n" - "b 11f\n" - "5:" // Height 1: Partial accumulate: partial_2_8 - "tbz x11, #1, 6f\n" - "ldr d11, [x9], #0x8\n" - "mov x20, #0x28\n" - "tbz x11, #0, 11f\n" - "ld1 { v11.s }[2], [x9]\n" - "b 11f\n" - "6:" // Height 1: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 11f\n" - "ldr s11, [x9, #0x0]\n" - "b 11f\n" - "7:" // Height 1: Partial accumulate: partial_4_0 - "tbz x11, #2, 9f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "tbz x11, #1, 8f\n" - "ldr d10, [x9], #0x8\n" - "mov x20, #0x18\n" - "tbz x11, #0, 11f\n" - "ld1 { v10.s }[2], [x9]\n" - "b 11f\n" - "8:" // Height 1: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz 
x11, #0, 11f\n" - "ldr s10, [x9, #0x0]\n" - "b 11f\n" - "9:" // Height 1: Partial accumulate: partial_2_0 - "tbz x11, #1, 10f\n" - "ldr d9, [x9], #0x8\n" - "mov x20, #0x8\n" - "tbz x11, #0, 11f\n" - "ld1 { v9.s }[2], [x9]\n" - "b 11f\n" - "10:" // Height 1: Partial accumulate: partial_1_0 - "ldr s9, [x9, #0x0]\n" - "mov x20, #0x0\n" - "11:" // Height 1: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 13f\n" - "12:" // Height 1: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "13:" // Height 1: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - "zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" - "zip2 v15.2d, v16.2d, v15.2d\n" - "b 15f\n" - "14:" // Height 1: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "15:" // Height 1: setup done - "mov x28, #0x0\n" - "16:" // Height 1: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 17f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "cbnz x28, 18f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "b 18f\n" - "17:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "18:" // Height 1: input setup done - "cmp x27, #0x8\n" - "blt 21f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q7, [x10, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q6, [x10, #0x10]\n" - "blt 20f\n" - "19:" // Height 1: Multiply loop: Main loop head - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x26, 
#0x80]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "bge 19b\n" - "20:" // Height 1: Multiply loop: Single iteration only - "add x26, x26, #0x10\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x26, #0x80]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, 
v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "21:" // Height 1: Multiply loop: Main loop skip - "cbz x27, 26f\n" - "cmp x27, #0x4\n" - "blt 23f\n" - "22:" // Height 1: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "sub x27, x27, #0x4\n" - "ldr q7, [x10, #0x10]\n" - "cmp x27, #0x4\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "bge 22b\n" - "23:" // Height 1: Multiply loop: Skip odd blocks - "cbz x27, 26f\n" - "tbz x27, #1, 24f\n" - "ldr s1, [x26], #0x4\n" - "tbz x27, #0, 25f\n" - "ld1 { 
v1.h }[2], [x26]\n" - "b 25f\n" - "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "25:" // Height 1: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "26:" // Height 1: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 16b\n" - "uzp1 v8.2d, v8.2d, v12.2d\n" - "uzp1 v9.2d, v9.2d, v13.2d\n" - "prfm pstl1keep, [x9, #0x0]\n" - "uzp1 v10.2d, v10.2d, v14.2d\n" - "uzp1 v11.2d, v11.2d, v15.2d\n" - "tbz %x[flags], #1, 27f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" - "fmin v8.4s, v8.4s, v18.4s\n" - "fmin v9.4s, v9.4s, v18.4s\n" - "fmin v10.4s, v10.4s, v18.4s\n" - "fmin v11.4s, v11.4s, v18.4s\n" - "fmax v8.4s, v8.4s, v17.4s\n" - "fmax v9.4s, v9.4s, v17.4s\n" - "fmax v10.4s, v10.4s, v17.4s\n" - "fmax v11.4s, v11.4s, v17.4s\n" - "27:" // Height 1: No activation - "cmp x11, #0x10\n" - "bge 36f\n" - "tbz x11, #3, 31f\n" - "st1 { v8.4s }, [x9], #0x10\n" - "st1 { v9.4s }, [x9], #0x10\n" - "tbz x11, #2, 29f\n" - "st1 { v10.4s }, [x9], #0x10\n" - "tbz x11, #1, 28f\n" - "str d11, [x9], #0x8\n" - "tbz x11, #0, 35f\n" - "st1 { v11.s }[2], [x9]\n" - "b 35f\n" - "28:" // 
Height 1: Partial direct writeback: partial_1_12 - "tbz x11, #0, 35f\n" - "str s11, [x9, #0x0]\n" - "b 35f\n" - "29:" // Height 1: Partial direct writeback: partial_2_8 - "tbz x11, #1, 30f\n" - "str d10, [x9], #0x8\n" - "tbz x11, #0, 35f\n" - "st1 { v10.s }[2], [x9]\n" - "b 35f\n" - "30:" // Height 1: Partial direct writeback: partial_1_8 - "tbz x11, #0, 35f\n" - "str s10, [x9, #0x0]\n" - "b 35f\n" - "31:" // Height 1: Partial direct writeback: partial_4_0 - "tbz x11, #2, 33f\n" - "st1 { v8.4s }, [x9], #0x10\n" - "tbz x11, #1, 32f\n" - "str d9, [x9], #0x8\n" - "tbz x11, #0, 35f\n" - "st1 { v9.s }[2], [x9]\n" - "b 35f\n" - "32:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x11, #0, 35f\n" - "str s9, [x9, #0x0]\n" - "b 35f\n" - "33:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x11, #1, 34f\n" - "str d8, [x9], #0x8\n" - "tbz x11, #0, 35f\n" - "st1 { v8.s }[2], [x9]\n" - "b 35f\n" - "34:" // Height 1: Partial direct writeback: partial_1_0 - "str s8, [x9, #0x0]\n" - "35:" // Height 1: Partial direct writeback: Done - "b 37f\n" - "36:" // Height 1: Full writeback - "str q8, [x9, #0x0]\n" - "str q9, [x9, #0x10]\n" - "str q10, [x9, #0x20]\n" - "str q11, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "37:" // Height 1: Writeback done - "subs x11, x11, #0x10\n" - "bgt 2b\n" - "b 224f\n" - "38:" // Height 2 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "39:" // Height 2: Column loop - "cbz x10, 40f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "b 52f\n" - "40:" // Height 2: no bias - "tbz %x[flags], #0, 
51f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x10\n" - "add x26, x9, x20, LSL #2\n" - "bge 49f\n" - "tbz x11, #3, 44f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "ld1 { v13.4s }, [x26], #0x10\n" - "tbz x11, #2, 42f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "ld1 { v14.4s }, [x26], #0x10\n" - "tbz x11, #1, 41f\n" - "ldr d16, [x9], #0x8\n" - "ldr d15, [x26], #0x8\n" - "mov x20, #0x38\n" - "tbz x11, #0, 48f\n" - "ld1 { v16.s }[2], [x9]\n" - "ld1 { v15.s }[2], [x26]\n" - "b 48f\n" - "41:" // Height 2: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 48f\n" - "ldr s16, [x9, #0x0]\n" - "ldr s15, [x26, #0x0]\n" - "b 48f\n" - "42:" // Height 2: Partial accumulate: partial_2_8 - "tbz x11, #1, 43f\n" - "ldr d11, [x9], #0x8\n" - "ldr d14, [x26], #0x8\n" - "mov x20, #0x28\n" - "tbz x11, #0, 48f\n" - "ld1 { v11.s }[2], [x9]\n" - "ld1 { v14.s }[2], [x26]\n" - "b 48f\n" - "43:" // Height 2: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 48f\n" - "ldr s11, [x9, #0x0]\n" - "ldr s14, [x26, #0x0]\n" - "b 48f\n" - "44:" // Height 2: Partial accumulate: partial_4_0 - "tbz x11, #2, 46f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "tbz x11, #1, 45f\n" - "ldr d10, [x9], #0x8\n" - "ldr d13, [x26], #0x8\n" - "mov x20, #0x18\n" - "tbz x11, #0, 48f\n" - "ld1 { v10.s }[2], [x9]\n" - "ld1 { v13.s }[2], [x26]\n" - "b 48f\n" - "45:" // Height 2: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 48f\n" - "ldr s10, [x9, #0x0]\n" - "ldr s13, [x26, #0x0]\n" - "b 48f\n" - "46:" // Height 2: Partial accumulate: partial_2_0 - "tbz x11, #1, 47f\n" - "ldr d9, [x9], #0x8\n" - "ldr d12, [x26], #0x8\n" - "mov x20, #0x8\n" - "tbz x11, #0, 48f\n" - "ld1 { v9.s }[2], [x9]\n" - "ld1 { v12.s }[2], [x26]\n" - "b 48f\n" - "47:" // Height 2: Partial accumulate: partial_1_0 - "ldr s9, [x9, #0x0]\n" - "ldr s12, [x26, #0x0]\n" - "mov x20, 
#0x0\n" - "48:" // Height 2: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 50f\n" - "49:" // Height 2: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "ldr q12, [x26, #0x0]\n" - "ldr q13, [x26, #0x10]\n" - "ldr q14, [x26, #0x20]\n" - "ldr q15, [x26, #0x30]\n" - "50:" // Height 2: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - "zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" - "zip2 v15.2d, v16.2d, v15.2d\n" - "b 52f\n" - "51:" // Height 2: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "52:" // Height 2: setup done - "mov x28, #0x0\n" - "53:" // Height 2: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 54f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "cbnz x28, 55f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "b 55f\n" - "54:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #1\n" - "55:" // Height 2: input setup done - "cmp x27, #0x8\n" - "blt 58f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q2, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "blt 57f\n" - "56:" // Height 2: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" 
- "ldr q2, [x25, #0x0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "bge 56b\n" - "57:" // Height 2: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, 
#0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "58:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 63f\n" - "cmp x27, #0x4\n" - "blt 60f\n" - "59:" // Height 2: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "sub x27, x27, #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "cmp x27, #0x4\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - "bge 59b\n" - "60:" // Height 2: Multiply loop: Skip odd blocks - "cbz x27, 63f\n" - "tbz 
x27, #1, 61f\n" - "ldr s1, [x26], #0x4\n" - "ldr s2, [x25], #0x4\n" - "tbz x27, #0, 62f\n" - "ld1 { v1.h }[2], [x26]\n" - "ld1 { v2.h }[2], [x25]\n" - "b 62f\n" - "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "ldr h2, [x25, #0x0]\n" - "62:" // Height 2: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - "63:" // Height 2: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 53b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" - "uzp2 v8.2d, v8.2d, v12.2d\n" - "prfm pstl1keep, [x9, #0x0]\n" - "uzp1 v12.2d, v9.2d, v13.2d\n" - "uzp2 v9.2d, v9.2d, v13.2d\n" - "uzp1 v13.2d, v10.2d, v14.2d\n" - "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x26, x9, x20, LSL #2\n" - "uzp1 v14.2d, v11.2d, v15.2d\n" - "uzp2 v11.2d, v11.2d, v15.2d\n" - "prfm pstl1keep, [x26, #0x0]\n" - "tbz %x[flags], #1, 64f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v18.4s }, [x21]\n" - "ld1r { v17.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v18.4s\n" - "fmin v12.4s, v12.4s, v18.4s\n" - "fmin v13.4s, v13.4s, v18.4s\n" - "fmin v14.4s, v14.4s, v18.4s\n" - "fmin v8.4s, v8.4s, v18.4s\n" - "fmin v9.4s, v9.4s, 
v18.4s\n" - "fmin v10.4s, v10.4s, v18.4s\n" - "fmin v11.4s, v11.4s, v18.4s\n" - "fmax v7.4s, v7.4s, v17.4s\n" - "fmax v12.4s, v12.4s, v17.4s\n" - "fmax v13.4s, v13.4s, v17.4s\n" - "fmax v14.4s, v14.4s, v17.4s\n" - "fmax v8.4s, v8.4s, v17.4s\n" - "fmax v9.4s, v9.4s, v17.4s\n" - "fmax v10.4s, v10.4s, v17.4s\n" - "fmax v11.4s, v11.4s, v17.4s\n" - "64:" // Height 2: No activation - "cmp x11, #0x10\n" - "bge 73f\n" - "tbz x11, #3, 68f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v9.4s }, [x26], #0x10\n" - "tbz x11, #2, 66f\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v10.4s }, [x26], #0x10\n" - "tbz x11, #1, 65f\n" - "str d14, [x9], #0x8\n" - "str d11, [x26], #0x8\n" - "tbz x11, #0, 72f\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v11.s }[2], [x26]\n" - "b 72f\n" - "65:" // Height 2: Partial direct writeback: partial_1_12 - "tbz x11, #0, 72f\n" - "str s14, [x9, #0x0]\n" - "str s11, [x26, #0x0]\n" - "b 72f\n" - "66:" // Height 2: Partial direct writeback: partial_2_8 - "tbz x11, #1, 67f\n" - "str d13, [x9], #0x8\n" - "str d10, [x26], #0x8\n" - "tbz x11, #0, 72f\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v10.s }[2], [x26]\n" - "b 72f\n" - "67:" // Height 2: Partial direct writeback: partial_1_8 - "tbz x11, #0, 72f\n" - "str s13, [x9, #0x0]\n" - "str s10, [x26, #0x0]\n" - "b 72f\n" - "68:" // Height 2: Partial direct writeback: partial_4_0 - "tbz x11, #2, 70f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "tbz x11, #1, 69f\n" - "str d12, [x9], #0x8\n" - "str d9, [x26], #0x8\n" - "tbz x11, #0, 72f\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v9.s }[2], [x26]\n" - "b 72f\n" - "69:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x11, #0, 72f\n" - "str s12, [x9, #0x0]\n" - "str s9, [x26, #0x0]\n" - "b 72f\n" - "70:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x11, #1, 71f\n" - "str d7, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "tbz x11, #0, 72f\n" - "st1 { v7.s }[2], 
[x9]\n" - "st1 { v8.s }[2], [x26]\n" - "b 72f\n" - "71:" // Height 2: Partial direct writeback: partial_1_0 - "str s7, [x9, #0x0]\n" - "str s8, [x26, #0x0]\n" - "72:" // Height 2: Partial direct writeback: Done - "b 74f\n" - "73:" // Height 2: Full writeback - "str q7, [x9, #0x0]\n" - "str q12, [x9, #0x10]\n" - "str q13, [x9, #0x20]\n" - "str q14, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "str q8, [x26, #0x0]\n" - "str q9, [x26, #0x10]\n" - "str q10, [x26, #0x20]\n" - "str q11, [x26, #0x30]\n" - "74:" // Height 2: Writeback done - "subs x11, x11, #0x10\n" - "bgt 39b\n" - "b 224f\n" - "75:" // Height 3 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "76:" // Height 3: Column loop - "cbz x10, 77f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "mov v16.16b, v8.16b\n" - "mov v20.16b, v12.16b\n" - "mov v17.16b, v9.16b\n" - "mov v21.16b, v13.16b\n" - "mov v18.16b, v10.16b\n" - "mov v22.16b, v14.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v15.16b\n" - "b 89f\n" - "77:" // Height 3: no bias - "tbz %x[flags], #0, 88f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x10\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "bge 86f\n" - "tbz x11, #3, 81f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "ld1 { v13.4s }, [x26], #0x10\n" - "ld1 { v18.4s }, [x25], #0x10\n" - "tbz x11, #2, 79f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "ld1 { v14.4s }, [x26], #0x10\n" - "ld1 { v19.4s }, [x25], #0x10\n" 
- "tbz x11, #1, 78f\n" - "ldr d16, [x9], #0x8\n" - "ldr d15, [x26], #0x8\n" - "mov x20, #0x38\n" - "ldr d24, [x25], #0x8\n" - "tbz x11, #0, 85f\n" - "ld1 { v16.s }[2], [x9]\n" - "ld1 { v15.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "b 85f\n" - "78:" // Height 3: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 85f\n" - "ldr s16, [x9, #0x0]\n" - "ldr s15, [x26, #0x0]\n" - "ldr s24, [x25, #0x0]\n" - "b 85f\n" - "79:" // Height 3: Partial accumulate: partial_2_8 - "tbz x11, #1, 80f\n" - "ldr d11, [x9], #0x8\n" - "ldr d14, [x26], #0x8\n" - "mov x20, #0x28\n" - "ldr d19, [x25], #0x8\n" - "tbz x11, #0, 85f\n" - "ld1 { v11.s }[2], [x9]\n" - "ld1 { v14.s }[2], [x26]\n" - "ld1 { v19.s }[2], [x25]\n" - "b 85f\n" - "80:" // Height 3: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 85f\n" - "ldr s11, [x9, #0x0]\n" - "ldr s14, [x26, #0x0]\n" - "ldr s19, [x25, #0x0]\n" - "b 85f\n" - "81:" // Height 3: Partial accumulate: partial_4_0 - "tbz x11, #2, 83f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "tbz x11, #1, 82f\n" - "ldr d10, [x9], #0x8\n" - "ldr d13, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d18, [x25], #0x8\n" - "tbz x11, #0, 85f\n" - "ld1 { v10.s }[2], [x9]\n" - "ld1 { v13.s }[2], [x26]\n" - "ld1 { v18.s }[2], [x25]\n" - "b 85f\n" - "82:" // Height 3: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 85f\n" - "ldr s10, [x9, #0x0]\n" - "ldr s13, [x26, #0x0]\n" - "ldr s18, [x25, #0x0]\n" - "b 85f\n" - "83:" // Height 3: Partial accumulate: partial_2_0 - "tbz x11, #1, 84f\n" - "ldr d9, [x9], #0x8\n" - "ldr d12, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d17, [x25], #0x8\n" - "tbz x11, #0, 85f\n" - "ld1 { v9.s }[2], [x9]\n" - "ld1 { v12.s }[2], [x26]\n" - "ld1 { v17.s }[2], [x25]\n" - "b 85f\n" - "84:" // Height 3: Partial accumulate: partial_1_0 - "ldr s9, [x9, #0x0]\n" - "ldr s12, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s17, [x25, #0x0]\n" - 
"85:" // Height 3: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 87f\n" - "86:" // Height 3: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "ldr q12, [x26, #0x0]\n" - "ldr q13, [x26, #0x10]\n" - "ldr q14, [x26, #0x20]\n" - "ldr q15, [x26, #0x30]\n" - "ldr q17, [x25, #0x0]\n" - "ldr q18, [x25, #0x10]\n" - "ldr q19, [x25, #0x20]\n" - "ldr q24, [x25, #0x30]\n" - "87:" // Height 3: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - "zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" - "zip2 v15.2d, v16.2d, v15.2d\n" - "zip1 v16.2d, v17.2d, v20.2d\n" - "zip2 v20.2d, v17.2d, v20.2d\n" - "zip1 v17.2d, v18.2d, v21.2d\n" - "zip2 v21.2d, v18.2d, v21.2d\n" - "zip1 v18.2d, v19.2d, v22.2d\n" - "zip2 v22.2d, v19.2d, v22.2d\n" - "zip1 v19.2d, v24.2d, v23.2d\n" - "zip2 v23.2d, v24.2d, v23.2d\n" - "b 89f\n" - "88:" // Height 3: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "89:" // Height 3: setup done - "mov x28, #0x0\n" - "90:" // Height 3: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 91f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "cbnz x28, 92f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - 
"add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "b 92f\n" - "91:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #1\n" - "add x24, x25, x21, LSL #1\n" - "92:" // Height 3: input setup done - "cmp x27, #0x8\n" - "blt 95f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q2, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q3, [x24, #0x0]\n" - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "blt 94f\n" - "93:" // Height 3: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "prfm pldl1keep, [x24, #0x80]\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, 
v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "bge 93b\n" - "94:" // Height 3: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // 
bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "95:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 100f\n" - "cmp x27, #0x4\n" - "blt 97f\n" - "96:" // Height 3: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "cmp x27, #0x4\n" - "ldr q7, [x10, #0x10]\n" - "trn1 v0.2d, 
v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - "bge 96b\n" - "97:" // Height 3: Multiply loop: Skip odd blocks - "cbz x27, 100f\n" - "tbz x27, #1, 98f\n" - "ldr s1, [x26], #0x4\n" - "ldr s2, [x25], #0x4\n" - "ldr s3, [x24], #0x4\n" - "tbz x27, #0, 99f\n" - "ld1 { v1.h }[2], [x26]\n" - "ld1 { v2.h }[2], [x25]\n" - "ld1 { v3.h }[2], [x24]\n" - "b 99f\n" - "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "ldr h2, [x25, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "99:" // Height 3: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" 
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "100:" // Height 3: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 90b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" - "uzp2 v8.2d, v8.2d, v12.2d\n" - "prfm pstl1keep, [x9, #0x0]\n" - "uzp1 v12.2d, v9.2d, v13.2d\n" - "uzp2 v9.2d, v9.2d, v13.2d\n" - "uzp1 v13.2d, v10.2d, v14.2d\n" - "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "uzp1 v14.2d, v11.2d, v15.2d\n" - "uzp2 v11.2d, v11.2d, v15.2d\n" - "prfm pstl1keep, [x26, #0x0]\n" - "prfm pstl1keep, [x25, #0x0]\n" - "uzp1 v16.2d, v16.2d, v20.2d\n" - "uzp1 v17.2d, v17.2d, v21.2d\n" - "uzp1 v18.2d, v18.2d, v22.2d\n" - "uzp1 v19.2d, v19.2d, v23.2d\n" - "tbz %x[flags], #1, 101f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v26.4s\n" - "fmin v12.4s, v12.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v26.4s\n" - "fmin v14.4s, v14.4s, v26.4s\n" - "fmin v8.4s, v8.4s, v26.4s\n" - "fmin v9.4s, v9.4s, v26.4s\n" - "fmin v10.4s, v10.4s, v26.4s\n" - "fmin v11.4s, 
v11.4s, v26.4s\n" - "fmin v16.4s, v16.4s, v26.4s\n" - "fmin v17.4s, v17.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v26.4s\n" - "fmin v19.4s, v19.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v25.4s\n" - "fmax v12.4s, v12.4s, v25.4s\n" - "fmax v13.4s, v13.4s, v25.4s\n" - "fmax v14.4s, v14.4s, v25.4s\n" - "fmax v8.4s, v8.4s, v25.4s\n" - "fmax v9.4s, v9.4s, v25.4s\n" - "fmax v10.4s, v10.4s, v25.4s\n" - "fmax v11.4s, v11.4s, v25.4s\n" - "fmax v16.4s, v16.4s, v25.4s\n" - "fmax v17.4s, v17.4s, v25.4s\n" - "fmax v18.4s, v18.4s, v25.4s\n" - "fmax v19.4s, v19.4s, v25.4s\n" - "101:" // Height 3: No activation - "cmp x11, #0x10\n" - "bge 110f\n" - "tbz x11, #3, 105f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v9.4s }, [x26], #0x10\n" - "st1 { v16.4s }, [x25], #0x10\n" - "st1 { v17.4s }, [x25], #0x10\n" - "tbz x11, #2, 103f\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v10.4s }, [x26], #0x10\n" - "st1 { v18.4s }, [x25], #0x10\n" - "tbz x11, #1, 102f\n" - "str d14, [x9], #0x8\n" - "str d11, [x26], #0x8\n" - "str d19, [x25], #0x8\n" - "tbz x11, #0, 109f\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v11.s }[2], [x26]\n" - "st1 { v19.s }[2], [x25]\n" - "b 109f\n" - "102:" // Height 3: Partial direct writeback: partial_1_12 - "tbz x11, #0, 109f\n" - "str s14, [x9, #0x0]\n" - "str s11, [x26, #0x0]\n" - "str s19, [x25, #0x0]\n" - "b 109f\n" - "103:" // Height 3: Partial direct writeback: partial_2_8 - "tbz x11, #1, 104f\n" - "str d13, [x9], #0x8\n" - "str d10, [x26], #0x8\n" - "str d18, [x25], #0x8\n" - "tbz x11, #0, 109f\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v10.s }[2], [x26]\n" - "st1 { v18.s }[2], [x25]\n" - "b 109f\n" - "104:" // Height 3: Partial direct writeback: partial_1_8 - "tbz x11, #0, 109f\n" - "str s13, [x9, #0x0]\n" - "str s10, [x26, #0x0]\n" - "str s18, [x25, #0x0]\n" - "b 109f\n" - "105:" // Height 3: Partial direct writeback: partial_4_0 - "tbz x11, #2, 107f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v8.4s }, 
[x26], #0x10\n" - "st1 { v16.4s }, [x25], #0x10\n" - "tbz x11, #1, 106f\n" - "str d12, [x9], #0x8\n" - "str d9, [x26], #0x8\n" - "str d17, [x25], #0x8\n" - "tbz x11, #0, 109f\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v9.s }[2], [x26]\n" - "st1 { v17.s }[2], [x25]\n" - "b 109f\n" - "106:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x11, #0, 109f\n" - "str s12, [x9, #0x0]\n" - "str s9, [x26, #0x0]\n" - "str s17, [x25, #0x0]\n" - "b 109f\n" - "107:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x11, #1, 108f\n" - "str d7, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d16, [x25], #0x8\n" - "tbz x11, #0, 109f\n" - "st1 { v7.s }[2], [x9]\n" - "st1 { v8.s }[2], [x26]\n" - "st1 { v16.s }[2], [x25]\n" - "b 109f\n" - "108:" // Height 3: Partial direct writeback: partial_1_0 - "str s7, [x9, #0x0]\n" - "str s8, [x26, #0x0]\n" - "str s16, [x25, #0x0]\n" - "109:" // Height 3: Partial direct writeback: Done - "b 111f\n" - "110:" // Height 3: Full writeback - "str q7, [x9, #0x0]\n" - "str q12, [x9, #0x10]\n" - "str q13, [x9, #0x20]\n" - "str q14, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "str q8, [x26, #0x0]\n" - "str q9, [x26, #0x10]\n" - "str q10, [x26, #0x20]\n" - "str q11, [x26, #0x30]\n" - "str q16, [x25, #0x0]\n" - "str q17, [x25, #0x10]\n" - "str q18, [x25, #0x20]\n" - "str q19, [x25, #0x30]\n" - "111:" // Height 3: Writeback done - "subs x11, x11, #0x10\n" - "bgt 76b\n" - "b 224f\n" - "112:" // Height 4 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "113:" // Height 4: Column loop - "cbz x10, 114f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, 
v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "mov v16.16b, v8.16b\n" - "mov v20.16b, v12.16b\n" - "mov v17.16b, v9.16b\n" - "mov v21.16b, v13.16b\n" - "mov v18.16b, v10.16b\n" - "mov v22.16b, v14.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v15.16b\n" - "b 126f\n" - "114:" // Height 4: no bias - "tbz %x[flags], #0, 125f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x10\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "bge 123f\n" - "tbz x11, #3, 118f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "ld1 { v13.4s }, [x26], #0x10\n" - "ld1 { v18.4s }, [x25], #0x10\n" - "ld1 { v21.4s }, [x24], #0x10\n" - "tbz x11, #2, 116f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "ld1 { v14.4s }, [x26], #0x10\n" - "ld1 { v19.4s }, [x25], #0x10\n" - "ld1 { v22.4s }, [x24], #0x10\n" - "tbz x11, #1, 115f\n" - "ldr d16, [x9], #0x8\n" - "ldr d15, [x26], #0x8\n" - "mov x20, #0x38\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "tbz x11, #0, 122f\n" - "ld1 { v16.s }[2], [x9]\n" - "ld1 { v15.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v23.s }[2], [x24]\n" - "b 122f\n" - "115:" // Height 4: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 122f\n" - "ldr s16, [x9, #0x0]\n" - "ldr s15, [x26, #0x0]\n" - "ldr s24, [x25, #0x0]\n" - "ldr s23, [x24, #0x0]\n" - "b 122f\n" - "116:" // Height 4: Partial accumulate: partial_2_8 - "tbz x11, #1, 117f\n" - "ldr d11, [x9], #0x8\n" - "ldr d14, [x26], #0x8\n" - "mov x20, #0x28\n" - "ldr d19, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "tbz x11, #0, 122f\n" - "ld1 { v11.s }[2], [x9]\n" - "ld1 { v14.s }[2], [x26]\n" - "ld1 { v19.s }[2], [x25]\n" - "ld1 { v22.s }[2], [x24]\n" - "b 122f\n" - "117:" // Height 4: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 122f\n" - "ldr s11, [x9, #0x0]\n" - 
"ldr s14, [x26, #0x0]\n" - "ldr s19, [x25, #0x0]\n" - "ldr s22, [x24, #0x0]\n" - "b 122f\n" - "118:" // Height 4: Partial accumulate: partial_4_0 - "tbz x11, #2, 120f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "tbz x11, #1, 119f\n" - "ldr d10, [x9], #0x8\n" - "ldr d13, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d18, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "tbz x11, #0, 122f\n" - "ld1 { v10.s }[2], [x9]\n" - "ld1 { v13.s }[2], [x26]\n" - "ld1 { v18.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "b 122f\n" - "119:" // Height 4: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 122f\n" - "ldr s10, [x9, #0x0]\n" - "ldr s13, [x26, #0x0]\n" - "ldr s18, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "b 122f\n" - "120:" // Height 4: Partial accumulate: partial_2_0 - "tbz x11, #1, 121f\n" - "ldr d9, [x9], #0x8\n" - "ldr d12, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d17, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "tbz x11, #0, 122f\n" - "ld1 { v9.s }[2], [x9]\n" - "ld1 { v12.s }[2], [x26]\n" - "ld1 { v17.s }[2], [x25]\n" - "ld1 { v20.s }[2], [x24]\n" - "b 122f\n" - "121:" // Height 4: Partial accumulate: partial_1_0 - "ldr s9, [x9, #0x0]\n" - "ldr s12, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s17, [x25, #0x0]\n" - "ldr s20, [x24, #0x0]\n" - "122:" // Height 4: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 124f\n" - "123:" // Height 4: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "ldr q12, [x26, #0x0]\n" - "ldr q13, [x26, #0x10]\n" - "ldr q14, [x26, #0x20]\n" - "ldr q15, [x26, #0x30]\n" - "ldr q17, [x25, #0x0]\n" - "ldr q18, [x25, #0x10]\n" - "ldr q19, [x25, #0x20]\n" - "ldr q24, [x25, #0x30]\n" - "ldr q20, [x24, #0x0]\n" - "ldr q21, [x24, #0x10]\n" - "ldr q22, [x24, #0x20]\n" - "ldr q23, [x24, #0x30]\n" - "124:" // Height 4: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - 
"zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" - "zip2 v15.2d, v16.2d, v15.2d\n" - "zip1 v16.2d, v17.2d, v20.2d\n" - "zip2 v20.2d, v17.2d, v20.2d\n" - "zip1 v17.2d, v18.2d, v21.2d\n" - "zip2 v21.2d, v18.2d, v21.2d\n" - "zip1 v18.2d, v19.2d, v22.2d\n" - "zip2 v22.2d, v19.2d, v22.2d\n" - "zip1 v19.2d, v24.2d, v23.2d\n" - "zip2 v23.2d, v24.2d, v23.2d\n" - "b 126f\n" - "125:" // Height 4: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "126:" // Height 4: setup done - "mov x28, #0x0\n" - "127:" // Height 4: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 128f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "cbnz x28, 129f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "b 129f\n" - "128:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #1\n" - "add x24, x25, x21, LSL #1\n" - "add x23, x24, x21, LSL #1\n" - "129:" // Height 4: input setup done - "cmp x27, #0x8\n" - "blt 132f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q2, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - 
"ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "blt 131f\n" - "130:" // Height 4: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "ldr q4, [x23, #0x0]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - "ldr q2, [x25, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 
0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "ldr q3, [x24, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "bge 130b\n" - "131:" // Height 4: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "sub x27, x27, #0x8\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla 
v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "132:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 137f\n" - "cmp x27, #0x4\n" - "blt 134f\n" - "133:" // Height 4: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "cmp x27, #0x4\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 
0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - "bge 133b\n" - "134:" // Height 4: Multiply loop: Skip odd blocks - "cbz x27, 137f\n" - "tbz x27, #1, 135f\n" - "ldr s1, [x26], #0x4\n" - "ldr s2, [x25], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x23], #0x4\n" - "tbz x27, #0, 136f\n" - "ld1 { v1.h }[2], [x26]\n" - "ld1 { v2.h }[2], [x25]\n" - "ld1 { v3.h }[2], [x24]\n" - "ld1 { v4.h }[2], [x23]\n" - "b 136f\n" - "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "ldr h2, [x25, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "ldr h4, [x23, #0x0]\n" - "136:" // Height 4: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "ldr q6, [x10, #0x10]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, 
v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "137:" // Height 4: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 127b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" - "uzp2 v8.2d, v8.2d, v12.2d\n" - "prfm pstl1keep, [x9, #0x0]\n" - "uzp1 v12.2d, v9.2d, v13.2d\n" - "uzp2 v9.2d, v9.2d, v13.2d\n" - "uzp1 v13.2d, v10.2d, v14.2d\n" - "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "uzp1 v14.2d, v11.2d, v15.2d\n" - "uzp2 v11.2d, v11.2d, v15.2d\n" - "prfm pstl1keep, [x26, #0x0]\n" - "uzp1 v15.2d, v16.2d, v20.2d\n" - "uzp2 v16.2d, v16.2d, v20.2d\n" - "prfm pstl1keep, [x25, #0x0]\n" - "prfm pstl1keep, [x24, #0x0]\n" - "uzp1 v20.2d, v17.2d, v21.2d\n" - "uzp2 v17.2d, v17.2d, v21.2d\n" - "uzp1 v21.2d, v18.2d, v22.2d\n" - "uzp2 v18.2d, v18.2d, v22.2d\n" - "uzp1 v22.2d, v19.2d, v23.2d\n" - "uzp2 v19.2d, v19.2d, v23.2d\n" - "tbz %x[flags], #1, 138f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v26.4s }, [x21]\n" - "ld1r { v25.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v26.4s\n" - "fmin v12.4s, v12.4s, v26.4s\n" - "fmin v13.4s, v13.4s, v26.4s\n" - "fmin v14.4s, v14.4s, v26.4s\n" - "fmin v8.4s, 
v8.4s, v26.4s\n" - "fmin v9.4s, v9.4s, v26.4s\n" - "fmin v10.4s, v10.4s, v26.4s\n" - "fmin v11.4s, v11.4s, v26.4s\n" - "fmin v15.4s, v15.4s, v26.4s\n" - "fmin v20.4s, v20.4s, v26.4s\n" - "fmin v21.4s, v21.4s, v26.4s\n" - "fmin v22.4s, v22.4s, v26.4s\n" - "fmin v16.4s, v16.4s, v26.4s\n" - "fmin v17.4s, v17.4s, v26.4s\n" - "fmin v18.4s, v18.4s, v26.4s\n" - "fmin v19.4s, v19.4s, v26.4s\n" - "fmax v7.4s, v7.4s, v25.4s\n" - "fmax v12.4s, v12.4s, v25.4s\n" - "fmax v13.4s, v13.4s, v25.4s\n" - "fmax v14.4s, v14.4s, v25.4s\n" - "fmax v8.4s, v8.4s, v25.4s\n" - "fmax v9.4s, v9.4s, v25.4s\n" - "fmax v10.4s, v10.4s, v25.4s\n" - "fmax v11.4s, v11.4s, v25.4s\n" - "fmax v15.4s, v15.4s, v25.4s\n" - "fmax v20.4s, v20.4s, v25.4s\n" - "fmax v21.4s, v21.4s, v25.4s\n" - "fmax v22.4s, v22.4s, v25.4s\n" - "fmax v16.4s, v16.4s, v25.4s\n" - "fmax v17.4s, v17.4s, v25.4s\n" - "fmax v18.4s, v18.4s, v25.4s\n" - "fmax v19.4s, v19.4s, v25.4s\n" - "138:" // Height 4: No activation - "cmp x11, #0x10\n" - "bge 147f\n" - "tbz x11, #3, 142f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v9.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "st1 { v17.4s }, [x24], #0x10\n" - "tbz x11, #2, 140f\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v10.4s }, [x26], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v18.4s }, [x24], #0x10\n" - "tbz x11, #1, 139f\n" - "str d14, [x9], #0x8\n" - "str d11, [x26], #0x8\n" - "str d22, [x25], #0x8\n" - "str d19, [x24], #0x8\n" - "tbz x11, #0, 146f\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v11.s }[2], [x26]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v19.s }[2], [x24]\n" - "b 146f\n" - "139:" // Height 4: Partial direct writeback: partial_1_12 - "tbz x11, #0, 146f\n" - "str s14, [x9, #0x0]\n" - "str s11, [x26, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s19, [x24, #0x0]\n" - "b 146f\n" - "140:" // Height 4: Partial direct 
writeback: partial_2_8 - "tbz x11, #1, 141f\n" - "str d13, [x9], #0x8\n" - "str d10, [x26], #0x8\n" - "str d21, [x25], #0x8\n" - "str d18, [x24], #0x8\n" - "tbz x11, #0, 146f\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v10.s }[2], [x26]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v18.s }[2], [x24]\n" - "b 146f\n" - "141:" // Height 4: Partial direct writeback: partial_1_8 - "tbz x11, #0, 146f\n" - "str s13, [x9, #0x0]\n" - "str s10, [x26, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s18, [x24, #0x0]\n" - "b 146f\n" - "142:" // Height 4: Partial direct writeback: partial_4_0 - "tbz x11, #2, 144f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "tbz x11, #1, 143f\n" - "str d12, [x9], #0x8\n" - "str d9, [x26], #0x8\n" - "str d20, [x25], #0x8\n" - "str d17, [x24], #0x8\n" - "tbz x11, #0, 146f\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v9.s }[2], [x26]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v17.s }[2], [x24]\n" - "b 146f\n" - "143:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x11, #0, 146f\n" - "str s12, [x9, #0x0]\n" - "str s9, [x26, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s17, [x24, #0x0]\n" - "b 146f\n" - "144:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x11, #1, 145f\n" - "str d7, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "tbz x11, #0, 146f\n" - "st1 { v7.s }[2], [x9]\n" - "st1 { v8.s }[2], [x26]\n" - "st1 { v15.s }[2], [x25]\n" - "st1 { v16.s }[2], [x24]\n" - "b 146f\n" - "145:" // Height 4: Partial direct writeback: partial_1_0 - "str s7, [x9, #0x0]\n" - "str s8, [x26, #0x0]\n" - "str s15, [x25, #0x0]\n" - "str s16, [x24, #0x0]\n" - "146:" // Height 4: Partial direct writeback: Done - "b 148f\n" - "147:" // Height 4: Full writeback - "str q7, [x9, #0x0]\n" - "str q12, [x9, #0x10]\n" - "str q13, [x9, #0x20]\n" - "str q14, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "str q8, [x26, #0x0]\n" - "str 
q9, [x26, #0x10]\n" - "str q10, [x26, #0x20]\n" - "str q11, [x26, #0x30]\n" - "str q15, [x25, #0x0]\n" - "str q20, [x25, #0x10]\n" - "str q21, [x25, #0x20]\n" - "str q22, [x25, #0x30]\n" - "str q16, [x24, #0x0]\n" - "str q17, [x24, #0x10]\n" - "str q18, [x24, #0x20]\n" - "str q19, [x24, #0x30]\n" - "148:" // Height 4: Writeback done - "subs x11, x11, #0x10\n" - "bgt 113b\n" - "b 224f\n" - "149:" // Height 5 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "150:" // Height 5: Column loop - "cbz x10, 151f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "mov v16.16b, v8.16b\n" - "mov v20.16b, v12.16b\n" - "mov v17.16b, v9.16b\n" - "mov v21.16b, v13.16b\n" - "mov v18.16b, v10.16b\n" - "mov v22.16b, v14.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v15.16b\n" - "mov v24.16b, v8.16b\n" - "mov v28.16b, v12.16b\n" - "mov v25.16b, v9.16b\n" - "mov v29.16b, v13.16b\n" - "mov v26.16b, v10.16b\n" - "mov v30.16b, v14.16b\n" - "mov v27.16b, v11.16b\n" - "mov v31.16b, v15.16b\n" - "b 163f\n" - "151:" // Height 5: no bias - "tbz %x[flags], #0, 162f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x10\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "bge 160f\n" - "tbz x11, #3, 155f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "ld1 { v13.4s }, [x26], #0x10\n" - 
"ld1 { v18.4s }, [x25], #0x10\n" - "ld1 { v21.4s }, [x24], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "tbz x11, #2, 153f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "ld1 { v14.4s }, [x26], #0x10\n" - "ld1 { v19.4s }, [x25], #0x10\n" - "ld1 { v22.4s }, [x24], #0x10\n" - "ld1 { v27.4s }, [x23], #0x10\n" - "tbz x11, #1, 152f\n" - "ldr d16, [x9], #0x8\n" - "ldr d15, [x26], #0x8\n" - "mov x20, #0x38\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "ldr d6, [x23], #0x8\n" - "tbz x11, #0, 159f\n" - "ld1 { v16.s }[2], [x9]\n" - "ld1 { v15.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v23.s }[2], [x24]\n" - "ld1 { v6.s }[2], [x23]\n" - "b 159f\n" - "152:" // Height 5: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 159f\n" - "ldr s16, [x9, #0x0]\n" - "ldr s15, [x26, #0x0]\n" - "ldr s24, [x25, #0x0]\n" - "ldr s23, [x24, #0x0]\n" - "ldr s6, [x23, #0x0]\n" - "b 159f\n" - "153:" // Height 5: Partial accumulate: partial_2_8 - "tbz x11, #1, 154f\n" - "ldr d11, [x9], #0x8\n" - "ldr d14, [x26], #0x8\n" - "mov x20, #0x28\n" - "ldr d19, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "tbz x11, #0, 159f\n" - "ld1 { v11.s }[2], [x9]\n" - "ld1 { v14.s }[2], [x26]\n" - "ld1 { v19.s }[2], [x25]\n" - "ld1 { v22.s }[2], [x24]\n" - "ld1 { v27.s }[2], [x23]\n" - "b 159f\n" - "154:" // Height 5: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 159f\n" - "ldr s11, [x9, #0x0]\n" - "ldr s14, [x26, #0x0]\n" - "ldr s19, [x25, #0x0]\n" - "ldr s22, [x24, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "b 159f\n" - "155:" // Height 5: Partial accumulate: partial_4_0 - "tbz x11, #2, 157f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "tbz x11, #1, 156f\n" - "ldr d10, [x9], #0x8\n" - "ldr d13, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d18, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d26, [x23], 
#0x8\n" - "tbz x11, #0, 159f\n" - "ld1 { v10.s }[2], [x9]\n" - "ld1 { v13.s }[2], [x26]\n" - "ld1 { v18.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "ld1 { v26.s }[2], [x23]\n" - "b 159f\n" - "156:" // Height 5: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 159f\n" - "ldr s10, [x9, #0x0]\n" - "ldr s13, [x26, #0x0]\n" - "ldr s18, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "b 159f\n" - "157:" // Height 5: Partial accumulate: partial_2_0 - "tbz x11, #1, 158f\n" - "ldr d9, [x9], #0x8\n" - "ldr d12, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d17, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "tbz x11, #0, 159f\n" - "ld1 { v9.s }[2], [x9]\n" - "ld1 { v12.s }[2], [x26]\n" - "ld1 { v17.s }[2], [x25]\n" - "ld1 { v20.s }[2], [x24]\n" - "ld1 { v25.s }[2], [x23]\n" - "b 159f\n" - "158:" // Height 5: Partial accumulate: partial_1_0 - "ldr s9, [x9, #0x0]\n" - "ldr s12, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s17, [x25, #0x0]\n" - "ldr s20, [x24, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "159:" // Height 5: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 161f\n" - "160:" // Height 5: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "ldr q12, [x26, #0x0]\n" - "ldr q13, [x26, #0x10]\n" - "ldr q14, [x26, #0x20]\n" - "ldr q15, [x26, #0x30]\n" - "ldr q17, [x25, #0x0]\n" - "ldr q18, [x25, #0x10]\n" - "ldr q19, [x25, #0x20]\n" - "ldr q24, [x25, #0x30]\n" - "ldr q20, [x24, #0x0]\n" - "ldr q21, [x24, #0x10]\n" - "ldr q22, [x24, #0x20]\n" - "ldr q23, [x24, #0x30]\n" - "ldr q25, [x23, #0x0]\n" - "ldr q26, [x23, #0x10]\n" - "ldr q27, [x23, #0x20]\n" - "ldr q6, [x23, #0x30]\n" - "161:" // Height 5: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - "zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" 
- "zip2 v15.2d, v16.2d, v15.2d\n" - "zip1 v16.2d, v17.2d, v20.2d\n" - "zip2 v20.2d, v17.2d, v20.2d\n" - "zip1 v17.2d, v18.2d, v21.2d\n" - "zip2 v21.2d, v18.2d, v21.2d\n" - "zip1 v18.2d, v19.2d, v22.2d\n" - "zip2 v22.2d, v19.2d, v22.2d\n" - "zip1 v19.2d, v24.2d, v23.2d\n" - "zip2 v23.2d, v24.2d, v23.2d\n" - "zip1 v24.2d, v25.2d, v28.2d\n" - "zip2 v28.2d, v25.2d, v28.2d\n" - "zip1 v25.2d, v26.2d, v29.2d\n" - "zip2 v29.2d, v26.2d, v29.2d\n" - "zip1 v26.2d, v27.2d, v30.2d\n" - "zip2 v30.2d, v27.2d, v30.2d\n" - "zip1 v27.2d, v6.2d, v31.2d\n" - "zip2 v31.2d, v6.2d, v31.2d\n" - "b 163f\n" - "162:" // Height 5: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - "movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "163:" // Height 5: setup done - "mov x28, #0x0\n" - "164:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 165f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x28, 166f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "add x22, x22, x20, LSL #1\n" - "b 166f\n" - "165:" // Height 5: setup direct 
input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #1\n" - "add x24, x25, x21, LSL #1\n" - "add x23, x24, x21, LSL #1\n" - "add x22, x23, x21, LSL #1\n" - "166:" // Height 5: input setup done - "cmp x27, #0x8\n" - "blt 169f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q2, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x22, #0x0]\n" - "ldr q7, [x10, #0x0]\n" - "blt 168f\n" - "167:" // Height 5: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "sub x27, x27, #0x8\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "add x22, x22, #0x10\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 
0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q2, [x25, #0x0]\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - "ldr q4, [x23, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, 
v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" - "ldr q5, [x22, #0x0]\n" - "bge 167b\n" - "168:" // Height 5: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, 
v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" - "169:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 174f\n" - "cmp x27, #0x4\n" - "blt 171f\n" - "170:" // Height 5: Multiply loop: Odd block loop - "ldr d1, [x26], 
#0x8\n" - "ldr d2, [x25], #0x8\n" - "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "cmp x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" - ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" - "bge 170b\n" - "171:" // Height 5: Multiply loop: Skip odd blocks - "cbz x27, 174f\n" - "tbz x27, #1, 172f\n" - "ldr s1, [x26], #0x4\n" - "ldr s2, [x25], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x23], 
#0x4\n" - "ldr s5, [x22], #0x4\n" - "tbz x27, #0, 173f\n" - "ld1 { v1.h }[2], [x26]\n" - "ld1 { v2.h }[2], [x25]\n" - "ld1 { v3.h }[2], [x24]\n" - "ld1 { v4.h }[2], [x23]\n" - "ld1 { v5.h }[2], [x22]\n" - "b 173f\n" - "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "ldr h2, [x25, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "ldr h4, [x23, #0x0]\n" - "ldr h5, [x22, #0x0]\n" - "173:" // Height 5: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" 
- ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "174:" // Height 5: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 164b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" - "uzp2 v8.2d, v8.2d, v12.2d\n" - "prfm pstl1keep, [x9, #0x0]\n" - "uzp1 v12.2d, v9.2d, v13.2d\n" - "uzp2 v9.2d, v9.2d, v13.2d\n" - "uzp1 v13.2d, v10.2d, v14.2d\n" - "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "uzp1 v14.2d, v11.2d, v15.2d\n" - "uzp2 v11.2d, v11.2d, v15.2d\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 v15.2d, v16.2d, v20.2d\n" - "uzp2 v16.2d, v16.2d, v20.2d\n" - "prfm pstl1keep, [x25, #0x0]\n" - "uzp1 v20.2d, v17.2d, v21.2d\n" - "uzp2 v17.2d, v17.2d, v21.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "prfm pstl1keep, [x23, #0x0]\n" - "uzp1 v21.2d, v18.2d, v22.2d\n" - "uzp2 v18.2d, v18.2d, v22.2d\n" - "uzp1 v22.2d, v19.2d, v23.2d\n" - "uzp2 v19.2d, v19.2d, v23.2d\n" - "uzp1 v24.2d, v24.2d, v28.2d\n" - "uzp1 v25.2d, v25.2d, v29.2d\n" - "uzp1 v26.2d, v26.2d, v30.2d\n" - "uzp1 v27.2d, v27.2d, v31.2d\n" - "tbz %x[flags], #1, 175f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v1.4s }, [x21]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - 
"fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmin v26.4s, v26.4s, v1.4s\n" - "fmin v27.4s, v27.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" - "175:" // Height 5: No activation - "cmp x11, #0x10\n" - "bge 184f\n" - "tbz x11, #3, 179f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v9.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "st1 { v17.4s }, [x24], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "st1 { v25.4s }, [x23], #0x10\n" - "tbz x11, #2, 177f\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v10.4s }, [x26], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v18.4s }, [x24], #0x10\n" - "st1 { v26.4s }, [x23], #0x10\n" - "tbz x11, #1, 176f\n" - "str d14, [x9], #0x8\n" - "str d11, [x26], #0x8\n" - "str d22, [x25], #0x8\n" - "str d19, [x24], #0x8\n" - "str d27, [x23], #0x8\n" - "tbz x11, #0, 183f\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v11.s }[2], [x26]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v19.s }[2], [x24]\n" - "st1 { v27.s }[2], [x23]\n" - "b 183f\n" - "176:" // Height 5: Partial direct writeback: partial_1_12 - "tbz x11, #0, 183f\n" - "str s14, [x9, #0x0]\n" - "str s11, [x26, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s19, [x24, #0x0]\n" - "str s27, [x23, #0x0]\n" - "b 183f\n" 
- "177:" // Height 5: Partial direct writeback: partial_2_8 - "tbz x11, #1, 178f\n" - "str d13, [x9], #0x8\n" - "str d10, [x26], #0x8\n" - "str d21, [x25], #0x8\n" - "str d18, [x24], #0x8\n" - "str d26, [x23], #0x8\n" - "tbz x11, #0, 183f\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v10.s }[2], [x26]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v18.s }[2], [x24]\n" - "st1 { v26.s }[2], [x23]\n" - "b 183f\n" - "178:" // Height 5: Partial direct writeback: partial_1_8 - "tbz x11, #0, 183f\n" - "str s13, [x9, #0x0]\n" - "str s10, [x26, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s18, [x24, #0x0]\n" - "str s26, [x23, #0x0]\n" - "b 183f\n" - "179:" // Height 5: Partial direct writeback: partial_4_0 - "tbz x11, #2, 181f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "st1 { v24.4s }, [x23], #0x10\n" - "tbz x11, #1, 180f\n" - "str d12, [x9], #0x8\n" - "str d9, [x26], #0x8\n" - "str d20, [x25], #0x8\n" - "str d17, [x24], #0x8\n" - "str d25, [x23], #0x8\n" - "tbz x11, #0, 183f\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v9.s }[2], [x26]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v17.s }[2], [x24]\n" - "st1 { v25.s }[2], [x23]\n" - "b 183f\n" - "180:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x11, #0, 183f\n" - "str s12, [x9, #0x0]\n" - "str s9, [x26, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s17, [x24, #0x0]\n" - "str s25, [x23, #0x0]\n" - "b 183f\n" - "181:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x11, #1, 182f\n" - "str d7, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "str d24, [x23], #0x8\n" - "tbz x11, #0, 183f\n" - "st1 { v7.s }[2], [x9]\n" - "st1 { v8.s }[2], [x26]\n" - "st1 { v15.s }[2], [x25]\n" - "st1 { v16.s }[2], [x24]\n" - "st1 { v24.s }[2], [x23]\n" - "b 183f\n" - "182:" // Height 5: Partial direct writeback: partial_1_0 - "str s7, [x9, #0x0]\n" - "str s8, [x26, #0x0]\n" - "str s15, [x25, 
#0x0]\n" - "str s16, [x24, #0x0]\n" - "str s24, [x23, #0x0]\n" - "183:" // Height 5: Partial direct writeback: Done - "b 185f\n" - "184:" // Height 5: Full writeback - "str q7, [x9, #0x0]\n" - "str q12, [x9, #0x10]\n" - "str q13, [x9, #0x20]\n" - "str q14, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "str q8, [x26, #0x0]\n" - "str q9, [x26, #0x10]\n" - "str q10, [x26, #0x20]\n" - "str q11, [x26, #0x30]\n" - "str q15, [x25, #0x0]\n" - "str q20, [x25, #0x10]\n" - "str q21, [x25, #0x20]\n" - "str q22, [x25, #0x30]\n" - "str q16, [x24, #0x0]\n" - "str q17, [x24, #0x10]\n" - "str q18, [x24, #0x20]\n" - "str q19, [x24, #0x30]\n" - "str q24, [x23, #0x0]\n" - "str q25, [x23, #0x10]\n" - "str q26, [x23, #0x20]\n" - "str q27, [x23, #0x30]\n" - "185:" // Height 5: Writeback done - "subs x11, x11, #0x10\n" - "bgt 150b\n" - "b 224f\n" - "186:" // Height 6 - "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x20, #0x18\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x9\n" - "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "187:" // Height 6: Column loop - "cbz x10, 188f\n" - "ldr q8, [x10, #0x0]\n" - "ldr q9, [x10, #0x10]\n" - "ldr q10, [x10, #0x20]\n" - "ldr q11, [x10, #0x30]\n" - "add x10, x10, #0x40\n" - "zip2 v12.2d, v8.2d, v8.2d\n" - "zip1 v8.2d, v8.2d, v8.2d\n" - "zip2 v13.2d, v9.2d, v9.2d\n" - "zip1 v9.2d, v9.2d, v9.2d\n" - "zip2 v14.2d, v10.2d, v10.2d\n" - "zip1 v10.2d, v10.2d, v10.2d\n" - "zip2 v15.2d, v11.2d, v11.2d\n" - "zip1 v11.2d, v11.2d, v11.2d\n" - "mov v16.16b, v8.16b\n" - "mov v20.16b, v12.16b\n" - "mov v17.16b, v9.16b\n" - "mov v21.16b, v13.16b\n" - "mov v18.16b, v10.16b\n" - "mov v22.16b, v14.16b\n" - "mov v19.16b, v11.16b\n" - "mov v23.16b, v15.16b\n" - "mov v24.16b, v8.16b\n" - "mov v28.16b, v12.16b\n" - "mov v25.16b, v9.16b\n" - "mov v29.16b, v13.16b\n" - "mov v26.16b, v10.16b\n" - "mov v30.16b, v14.16b\n" - "mov v27.16b, 
v11.16b\n" - "mov v31.16b, v15.16b\n" - "b 200f\n" - "188:" // Height 6: no bias - "tbz %x[flags], #0, 199f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x10\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "bge 197f\n" - "tbz x11, #3, 192f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x22], #0x10\n" - "ld1 { v10.4s }, [x9], #0x10\n" - "ld1 { v13.4s }, [x26], #0x10\n" - "ld1 { v18.4s }, [x25], #0x10\n" - "ld1 { v21.4s }, [x24], #0x10\n" - "ld1 { v26.4s }, [x23], #0x10\n" - "ld1 { v29.4s }, [x22], #0x10\n" - "tbz x11, #2, 190f\n" - "ld1 { v11.4s }, [x9], #0x10\n" - "ld1 { v14.4s }, [x26], #0x10\n" - "ld1 { v19.4s }, [x25], #0x10\n" - "ld1 { v22.4s }, [x24], #0x10\n" - "ld1 { v27.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x22], #0x10\n" - "tbz x11, #1, 189f\n" - "ldr d16, [x9], #0x8\n" - "ldr d15, [x26], #0x8\n" - "mov x20, #0x38\n" - "ldr d24, [x25], #0x8\n" - "ldr d23, [x24], #0x8\n" - "ldr d6, [x23], #0x8\n" - "ldr d31, [x22], #0x8\n" - "tbz x11, #0, 196f\n" - "ld1 { v16.s }[2], [x9]\n" - "ld1 { v15.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v23.s }[2], [x24]\n" - "ld1 { v6.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x22]\n" - "b 196f\n" - "189:" // Height 6: Partial accumulate: partial_1_12 - "mov x20, #0x30\n" - "tbz x11, #0, 196f\n" - "ldr s16, [x9, #0x0]\n" - "ldr s15, [x26, #0x0]\n" - "ldr s24, [x25, #0x0]\n" - "ldr s23, [x24, #0x0]\n" - "ldr s6, [x23, #0x0]\n" - "ldr s31, [x22, #0x0]\n" - "b 196f\n" - "190:" // Height 6: Partial accumulate: partial_2_8 - "tbz x11, #1, 191f\n" - "ldr d11, [x9], #0x8\n" - "ldr d14, [x26], #0x8\n" - "mov x20, #0x28\n" - "ldr d19, [x25], #0x8\n" - "ldr d22, [x24], #0x8\n" - "ldr d27, [x23], #0x8\n" - "ldr d30, [x22], #0x8\n" - "tbz 
x11, #0, 196f\n" - "ld1 { v11.s }[2], [x9]\n" - "ld1 { v14.s }[2], [x26]\n" - "ld1 { v19.s }[2], [x25]\n" - "ld1 { v22.s }[2], [x24]\n" - "ld1 { v27.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x22]\n" - "b 196f\n" - "191:" // Height 6: Partial accumulate: partial_1_8 - "mov x20, #0x20\n" - "tbz x11, #0, 196f\n" - "ldr s11, [x9, #0x0]\n" - "ldr s14, [x26, #0x0]\n" - "ldr s19, [x25, #0x0]\n" - "ldr s22, [x24, #0x0]\n" - "ldr s27, [x23, #0x0]\n" - "ldr s30, [x22, #0x0]\n" - "b 196f\n" - "192:" // Height 6: Partial accumulate: partial_4_0 - "tbz x11, #2, 194f\n" - "ld1 { v9.4s }, [x9], #0x10\n" - "ld1 { v12.4s }, [x26], #0x10\n" - "ld1 { v17.4s }, [x25], #0x10\n" - "ld1 { v20.4s }, [x24], #0x10\n" - "ld1 { v25.4s }, [x23], #0x10\n" - "ld1 { v28.4s }, [x22], #0x10\n" - "tbz x11, #1, 193f\n" - "ldr d10, [x9], #0x8\n" - "ldr d13, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d18, [x25], #0x8\n" - "ldr d21, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d29, [x22], #0x8\n" - "tbz x11, #0, 196f\n" - "ld1 { v10.s }[2], [x9]\n" - "ld1 { v13.s }[2], [x26]\n" - "ld1 { v18.s }[2], [x25]\n" - "ld1 { v21.s }[2], [x24]\n" - "ld1 { v26.s }[2], [x23]\n" - "ld1 { v29.s }[2], [x22]\n" - "b 196f\n" - "193:" // Height 6: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 196f\n" - "ldr s10, [x9, #0x0]\n" - "ldr s13, [x26, #0x0]\n" - "ldr s18, [x25, #0x0]\n" - "ldr s21, [x24, #0x0]\n" - "ldr s26, [x23, #0x0]\n" - "ldr s29, [x22, #0x0]\n" - "b 196f\n" - "194:" // Height 6: Partial accumulate: partial_2_0 - "tbz x11, #1, 195f\n" - "ldr d9, [x9], #0x8\n" - "ldr d12, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d17, [x25], #0x8\n" - "ldr d20, [x24], #0x8\n" - "ldr d25, [x23], #0x8\n" - "ldr d28, [x22], #0x8\n" - "tbz x11, #0, 196f\n" - "ld1 { v9.s }[2], [x9]\n" - "ld1 { v12.s }[2], [x26]\n" - "ld1 { v17.s }[2], [x25]\n" - "ld1 { v20.s }[2], [x24]\n" - "ld1 { v25.s }[2], [x23]\n" - "ld1 { v28.s }[2], [x22]\n" - "b 196f\n" - "195:" // Height 6: Partial accumulate: partial_1_0 - "ldr 
s9, [x9, #0x0]\n" - "ldr s12, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s17, [x25, #0x0]\n" - "ldr s20, [x24, #0x0]\n" - "ldr s25, [x23, #0x0]\n" - "ldr s28, [x22, #0x0]\n" - "196:" // Height 6: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 198f\n" - "197:" // Height 6: full accumulate - "ldr q9, [x9, #0x0]\n" - "ldr q10, [x9, #0x10]\n" - "ldr q11, [x9, #0x20]\n" - "ldr q16, [x9, #0x30]\n" - "ldr q12, [x26, #0x0]\n" - "ldr q13, [x26, #0x10]\n" - "ldr q14, [x26, #0x20]\n" - "ldr q15, [x26, #0x30]\n" - "ldr q17, [x25, #0x0]\n" - "ldr q18, [x25, #0x10]\n" - "ldr q19, [x25, #0x20]\n" - "ldr q24, [x25, #0x30]\n" - "ldr q20, [x24, #0x0]\n" - "ldr q21, [x24, #0x10]\n" - "ldr q22, [x24, #0x20]\n" - "ldr q23, [x24, #0x30]\n" - "ldr q25, [x23, #0x0]\n" - "ldr q26, [x23, #0x10]\n" - "ldr q27, [x23, #0x20]\n" - "ldr q6, [x23, #0x30]\n" - "ldr q28, [x22, #0x0]\n" - "ldr q29, [x22, #0x10]\n" - "ldr q30, [x22, #0x20]\n" - "ldr q31, [x22, #0x30]\n" - "198:" // Height 6: MMLA fixup - "zip1 v8.2d, v9.2d, v12.2d\n" - "zip2 v12.2d, v9.2d, v12.2d\n" - "zip1 v9.2d, v10.2d, v13.2d\n" - "zip2 v13.2d, v10.2d, v13.2d\n" - "zip1 v10.2d, v11.2d, v14.2d\n" - "zip2 v14.2d, v11.2d, v14.2d\n" - "zip1 v11.2d, v16.2d, v15.2d\n" - "zip2 v15.2d, v16.2d, v15.2d\n" - "zip1 v16.2d, v17.2d, v20.2d\n" - "zip2 v20.2d, v17.2d, v20.2d\n" - "zip1 v17.2d, v18.2d, v21.2d\n" - "zip2 v21.2d, v18.2d, v21.2d\n" - "zip1 v18.2d, v19.2d, v22.2d\n" - "zip2 v22.2d, v19.2d, v22.2d\n" - "zip1 v19.2d, v24.2d, v23.2d\n" - "zip2 v23.2d, v24.2d, v23.2d\n" - "zip1 v24.2d, v25.2d, v28.2d\n" - "zip2 v28.2d, v25.2d, v28.2d\n" - "zip1 v25.2d, v26.2d, v29.2d\n" - "zip2 v29.2d, v26.2d, v29.2d\n" - "zip1 v26.2d, v27.2d, v30.2d\n" - "zip2 v30.2d, v27.2d, v30.2d\n" - "zip1 v27.2d, v6.2d, v31.2d\n" - "zip2 v31.2d, v6.2d, v31.2d\n" - "b 200f\n" - "199:" // Height 6: no accumulate - "movi v8.16b, #0x0\n" - "movi v9.16b, #0x0\n" - "movi v10.16b, #0x0\n" - "movi v11.16b, #0x0\n" - "movi v12.16b, #0x0\n" - "movi v13.16b, #0x0\n" - 
"movi v14.16b, #0x0\n" - "movi v15.16b, #0x0\n" - "movi v16.16b, #0x0\n" - "movi v17.16b, #0x0\n" - "movi v18.16b, #0x0\n" - "movi v19.16b, #0x0\n" - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "200:" // Height 6: setup done - "mov x28, #0x0\n" - "201:" // Height 6: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 202f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "ldr x21, [x20, #0x28]\n" - "cbnz x28, 203f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #1\n" - "add x25, x25, x20, LSL #1\n" - "add x24, x24, x20, LSL #1\n" - "add x23, x23, x20, LSL #1\n" - "add x22, x22, x20, LSL #1\n" - "add x21, x21, x20, LSL #1\n" - "b 203f\n" - "202:" // Height 6: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #1\n" - "add x24, x25, x21, LSL #1\n" - "add x23, x24, x21, LSL #1\n" - "add x22, x23, x21, LSL #1\n" - "add x21, x22, x21, LSL #1\n" - "203:" // Height 6: input setup done - "cmp x27, #0x8\n" - "blt 206f\n" - "ldr q1, [x26, #0x0]\n" - "ldr q2, [x25, #0x0]\n" - "cmp x27, #0x10\n" - "ldr q3, [x24, #0x0]\n" - "ldr q4, [x23, #0x0]\n" - "ldr q5, [x22, #0x0]\n" - "ldr q6, [x21, #0x0]\n" - "ldr q7, [x10, #0x0]\n" - "blt 205f\n" - "204:" // Height 6: Multiply loop: Main loop head - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "sub x27, x27, #0x8\n" - "add x26, x26, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - 
"add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - "add x23, x23, #0x10\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "add x22, x22, #0x10\n" - "add x21, x21, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "cmp x27, #0x10\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "prfm pldl1keep, [x21, #0x80]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - "ldr q2, [x25, #0x0]\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - "ldr q4, 
[x23, #0x0]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0x0]\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - "ldr q1, [x26, #0x0]\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - "ldr q3, [x24, #0x0]\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" - "ldr q5, [x22, #0x0]\n" - "ldr q6, [x21, #0x0]\n" - "bge 204b\n" - "205:" // Height 6: Multiply loop: Single iteration only - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn2 v1.2d, v1.2d, v2.2d\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn2 v3.2d, v3.2d, v4.2d\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "trn2 v5.2d, v5.2d, v6.2d\n" - "ldr 
q6, [x10, #0x10]\n" - "add x22, x22, #0x10\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - "add x21, x21, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - "prfm pldl1keep, [x24, #0x80]\n" - "sub x27, x27, #0x8\n" - "prfm pldl1keep, [x23, #0x80]\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - "prfm pldl1keep, [x22, #0x80]\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - "prfm pldl1keep, [x21, #0x80]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x80]\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x90]\n" - ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, 
#0xa0]\n" - ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xb0]\n" - ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xc0]\n" - ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xd0]\n" - ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n" - "ldr q7, [x10, #0xe0]\n" - ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n" - "ldr q6, [x10, #0xf0]\n" - "add x10, x10, #0x100\n" - ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n" - ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n" - ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n" - ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n" - ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n" - ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n" - "206:" // Height 6: Multiply loop: Main loop skip - "cbz x27, 211f\n" - "cmp x27, #0x4\n" - "blt 208f\n" - "207:" // Height 6: Multiply loop: Odd block loop - "ldr d1, [x26], #0x8\n" - "ldr d2, [x25], #0x8\n" - "sub x27, x27, #0x4\n" - "ldr d3, [x24], #0x8\n" - "ldr d4, [x23], #0x8\n" - "cmp x27, #0x4\n" - "ldr d5, [x22], #0x8\n" - "ldr d7, [x21], #0x8\n" - "ldr q6, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v7.2d\n" - "ldr q7, [x10, #0x10]\n" - ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x20]\n" 
- ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x30]\n" - ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x40]\n" - ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x50]\n" - ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x60]\n" - ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n" - ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n" - "bge 207b\n" - "208:" // Height 6: Multiply loop: Skip odd blocks - "cbz x27, 211f\n" - "tbz x27, #1, 209f\n" - "ldr s1, [x26], #0x4\n" - "ldr s2, [x25], #0x4\n" - "ldr s3, [x24], #0x4\n" - "ldr s4, [x23], #0x4\n" - "ldr s5, [x22], #0x4\n" - "ldr s6, [x21], #0x4\n" - "tbz x27, #0, 210f\n" - "ld1 { v1.h }[2], [x26]\n" - "ld1 { v2.h }[2], [x25]\n" - "ld1 { v3.h }[2], [x24]\n" - "ld1 { v4.h }[2], [x23]\n" - "ld1 { v5.h }[2], [x22]\n" - "ld1 { v6.h }[2], [x21]\n" - "b 210f\n" - "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0 - "ldr h1, [x26, #0x0]\n" - "ldr h2, [x25, #0x0]\n" - "ldr h3, [x24, #0x0]\n" - "ldr h4, [x23, #0x0]\n" - "ldr h5, [x22, #0x0]\n" - "ldr h6, [x21, #0x0]\n" 
- "210:" // Height 6: Multiply loop: Ragged operand read: Done - "ldr q7, [x10, #0x0]\n" - "trn1 v0.2d, v1.2d, v2.2d\n" - "trn1 v2.2d, v3.2d, v4.2d\n" - "trn1 v4.2d, v5.2d, v6.2d\n" - "ldr q6, [x10, #0x10]\n" - ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x20]\n" - ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x30]\n" - ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x40]\n" - ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x50]\n" - ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n" - "ldr q7, [x10, #0x60]\n" - ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n" - "ldr q6, [x10, #0x70]\n" - "add x10, x10, #0x80\n" - ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n" - ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n" - ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n" - ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n" - ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n" - ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n" - "211:" // Height 6: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 201b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "uzp1 v7.2d, v8.2d, v12.2d\n" - "uzp2 v8.2d, v8.2d, v12.2d\n" - "prfm pstl1keep, 
[x9, #0x0]\n" - "uzp1 v12.2d, v9.2d, v13.2d\n" - "uzp2 v9.2d, v9.2d, v13.2d\n" - "uzp1 v13.2d, v10.2d, v14.2d\n" - "uzp2 v10.2d, v10.2d, v14.2d\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "uzp1 v14.2d, v11.2d, v15.2d\n" - "uzp2 v11.2d, v11.2d, v15.2d\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x23, x24, x20, LSL #2\n" - "uzp1 v15.2d, v16.2d, v20.2d\n" - "uzp2 v16.2d, v16.2d, v20.2d\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x22, x23, x20, LSL #2\n" - "uzp1 v20.2d, v17.2d, v21.2d\n" - "uzp2 v17.2d, v17.2d, v21.2d\n" - "prfm pstl1keep, [x24, #0x0]\n" - "uzp1 v21.2d, v18.2d, v22.2d\n" - "uzp2 v18.2d, v18.2d, v22.2d\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" - "uzp1 v22.2d, v19.2d, v23.2d\n" - "uzp2 v19.2d, v19.2d, v23.2d\n" - "uzp1 v23.2d, v24.2d, v28.2d\n" - "uzp2 v24.2d, v24.2d, v28.2d\n" - "uzp1 v28.2d, v25.2d, v29.2d\n" - "uzp2 v25.2d, v25.2d, v29.2d\n" - "uzp1 v29.2d, v26.2d, v30.2d\n" - "uzp2 v26.2d, v26.2d, v30.2d\n" - "uzp1 v30.2d, v27.2d, v31.2d\n" - "uzp2 v27.2d, v27.2d, v31.2d\n" - "tbz %x[flags], #1, 212f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v1.4s }, [x21]\n" - "ld1r { v0.4s }, [x20]\n" - "fmin v7.4s, v7.4s, v1.4s\n" - "fmin v12.4s, v12.4s, v1.4s\n" - "fmin v13.4s, v13.4s, v1.4s\n" - "fmin v14.4s, v14.4s, v1.4s\n" - "fmin v8.4s, v8.4s, v1.4s\n" - "fmin v9.4s, v9.4s, v1.4s\n" - "fmin v10.4s, v10.4s, v1.4s\n" - "fmin v11.4s, v11.4s, v1.4s\n" - "fmin v15.4s, v15.4s, v1.4s\n" - "fmin v20.4s, v20.4s, v1.4s\n" - "fmin v21.4s, v21.4s, v1.4s\n" - "fmin v22.4s, v22.4s, v1.4s\n" - "fmin v16.4s, v16.4s, v1.4s\n" - "fmin v17.4s, v17.4s, v1.4s\n" - "fmin v18.4s, v18.4s, v1.4s\n" - "fmin v19.4s, v19.4s, v1.4s\n" - "fmin v23.4s, v23.4s, v1.4s\n" - "fmin v28.4s, v28.4s, v1.4s\n" - "fmin v29.4s, v29.4s, v1.4s\n" - "fmin v30.4s, v30.4s, v1.4s\n" - "fmin v24.4s, v24.4s, v1.4s\n" - "fmin v25.4s, v25.4s, v1.4s\n" - "fmin v26.4s, 
v26.4s, v1.4s\n" - "fmin v27.4s, v27.4s, v1.4s\n" - "fmax v7.4s, v7.4s, v0.4s\n" - "fmax v12.4s, v12.4s, v0.4s\n" - "fmax v13.4s, v13.4s, v0.4s\n" - "fmax v14.4s, v14.4s, v0.4s\n" - "fmax v8.4s, v8.4s, v0.4s\n" - "fmax v9.4s, v9.4s, v0.4s\n" - "fmax v10.4s, v10.4s, v0.4s\n" - "fmax v11.4s, v11.4s, v0.4s\n" - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v23.4s, v23.4s, v0.4s\n" - "fmax v28.4s, v28.4s, v0.4s\n" - "fmax v29.4s, v29.4s, v0.4s\n" - "fmax v30.4s, v30.4s, v0.4s\n" - "fmax v24.4s, v24.4s, v0.4s\n" - "fmax v25.4s, v25.4s, v0.4s\n" - "fmax v26.4s, v26.4s, v0.4s\n" - "fmax v27.4s, v27.4s, v0.4s\n" - "212:" // Height 6: No activation - "cmp x11, #0x10\n" - "bge 221f\n" - "tbz x11, #3, 216f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v12.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v9.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v20.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "st1 { v17.4s }, [x24], #0x10\n" - "st1 { v23.4s }, [x23], #0x10\n" - "st1 { v28.4s }, [x23], #0x10\n" - "st1 { v24.4s }, [x22], #0x10\n" - "st1 { v25.4s }, [x22], #0x10\n" - "tbz x11, #2, 214f\n" - "st1 { v13.4s }, [x9], #0x10\n" - "st1 { v10.4s }, [x26], #0x10\n" - "st1 { v21.4s }, [x25], #0x10\n" - "st1 { v18.4s }, [x24], #0x10\n" - "st1 { v29.4s }, [x23], #0x10\n" - "st1 { v26.4s }, [x22], #0x10\n" - "tbz x11, #1, 213f\n" - "str d14, [x9], #0x8\n" - "str d11, [x26], #0x8\n" - "str d22, [x25], #0x8\n" - "str d19, [x24], #0x8\n" - "str d30, [x23], #0x8\n" - "str d27, [x22], #0x8\n" - "tbz x11, #0, 220f\n" - "st1 { v14.s }[2], [x9]\n" - "st1 { v11.s }[2], [x26]\n" - "st1 { v22.s }[2], [x25]\n" - "st1 { v19.s }[2], [x24]\n" - "st1 { v30.s }[2], [x23]\n" - "st1 { v27.s }[2], [x22]\n" - "b 220f\n" - "213:" // Height 
6: Partial direct writeback: partial_1_12 - "tbz x11, #0, 220f\n" - "str s14, [x9, #0x0]\n" - "str s11, [x26, #0x0]\n" - "str s22, [x25, #0x0]\n" - "str s19, [x24, #0x0]\n" - "str s30, [x23, #0x0]\n" - "str s27, [x22, #0x0]\n" - "b 220f\n" - "214:" // Height 6: Partial direct writeback: partial_2_8 - "tbz x11, #1, 215f\n" - "str d13, [x9], #0x8\n" - "str d10, [x26], #0x8\n" - "str d21, [x25], #0x8\n" - "str d18, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "str d26, [x22], #0x8\n" - "tbz x11, #0, 220f\n" - "st1 { v13.s }[2], [x9]\n" - "st1 { v10.s }[2], [x26]\n" - "st1 { v21.s }[2], [x25]\n" - "st1 { v18.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" - "st1 { v26.s }[2], [x22]\n" - "b 220f\n" - "215:" // Height 6: Partial direct writeback: partial_1_8 - "tbz x11, #0, 220f\n" - "str s13, [x9, #0x0]\n" - "str s10, [x26, #0x0]\n" - "str s21, [x25, #0x0]\n" - "str s18, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" - "str s26, [x22, #0x0]\n" - "b 220f\n" - "216:" // Height 6: Partial direct writeback: partial_4_0 - "tbz x11, #2, 218f\n" - "st1 { v7.4s }, [x9], #0x10\n" - "st1 { v8.4s }, [x26], #0x10\n" - "st1 { v15.4s }, [x25], #0x10\n" - "st1 { v16.4s }, [x24], #0x10\n" - "st1 { v23.4s }, [x23], #0x10\n" - "st1 { v24.4s }, [x22], #0x10\n" - "tbz x11, #1, 217f\n" - "str d12, [x9], #0x8\n" - "str d9, [x26], #0x8\n" - "str d20, [x25], #0x8\n" - "str d17, [x24], #0x8\n" - "str d28, [x23], #0x8\n" - "str d25, [x22], #0x8\n" - "tbz x11, #0, 220f\n" - "st1 { v12.s }[2], [x9]\n" - "st1 { v9.s }[2], [x26]\n" - "st1 { v20.s }[2], [x25]\n" - "st1 { v17.s }[2], [x24]\n" - "st1 { v28.s }[2], [x23]\n" - "st1 { v25.s }[2], [x22]\n" - "b 220f\n" - "217:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x11, #0, 220f\n" - "str s12, [x9, #0x0]\n" - "str s9, [x26, #0x0]\n" - "str s20, [x25, #0x0]\n" - "str s17, [x24, #0x0]\n" - "str s28, [x23, #0x0]\n" - "str s25, [x22, #0x0]\n" - "b 220f\n" - "218:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x11, #1, 219f\n" - "str 
d7, [x9], #0x8\n" - "str d8, [x26], #0x8\n" - "str d15, [x25], #0x8\n" - "str d16, [x24], #0x8\n" - "str d23, [x23], #0x8\n" - "str d24, [x22], #0x8\n" - "tbz x11, #0, 220f\n" - "st1 { v7.s }[2], [x9]\n" - "st1 { v8.s }[2], [x26]\n" - "st1 { v15.s }[2], [x25]\n" - "st1 { v16.s }[2], [x24]\n" - "st1 { v23.s }[2], [x23]\n" - "st1 { v24.s }[2], [x22]\n" - "b 220f\n" - "219:" // Height 6: Partial direct writeback: partial_1_0 - "str s7, [x9, #0x0]\n" - "str s8, [x26, #0x0]\n" - "str s15, [x25, #0x0]\n" - "str s16, [x24, #0x0]\n" - "str s23, [x23, #0x0]\n" - "str s24, [x22, #0x0]\n" - "220:" // Height 6: Partial direct writeback: Done - "b 222f\n" - "221:" // Height 6: Full writeback - "str q7, [x9, #0x0]\n" - "str q12, [x9, #0x10]\n" - "str q13, [x9, #0x20]\n" - "str q14, [x9, #0x30]\n" - "add x9, x9, #0x40\n" - "str q8, [x26, #0x0]\n" - "str q9, [x26, #0x10]\n" - "str q10, [x26, #0x20]\n" - "str q11, [x26, #0x30]\n" - "str q15, [x25, #0x0]\n" - "str q20, [x25, #0x10]\n" - "str q21, [x25, #0x20]\n" - "str q22, [x25, #0x30]\n" - "str q16, [x24, #0x0]\n" - "str q17, [x24, #0x10]\n" - "str q18, [x24, #0x20]\n" - "str q19, [x24, #0x30]\n" - "str q23, [x23, #0x0]\n" - "str q28, [x23, #0x10]\n" - "str q29, [x23, #0x20]\n" - "str q30, [x23, #0x30]\n" - "str q24, [x22, #0x0]\n" - "str q25, [x22, #0x10]\n" - "str q26, [x22, #0x20]\n" - "str q27, [x22, #0x30]\n" - "222:" // Height 6: Writeback done - "subs x11, x11, #0x10\n" - "bgt 187b\n" - "subs %x[m], %x[m], #0x6\n" - "beq 224f\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 223f\n" - "add x21, x21, #0x6\n" - "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "b 1b\n" - "223:" // Update direct input - "mov x20, #0xc\n" - "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" - "b 1b\n" - "224:" // Exit - : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) - : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), - [offset_min] "I"(offsetof(KernelArgs, minval)), 
[offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), - [offsetof_N] "I"(offsetof(KernelArgs, N)), - [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), - [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), - [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), - [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), - [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), - [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); -} diff --git a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h b/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h deleted file mode 100644 index a44d89b9..00000000 --- a/src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h +++ /dev/null @@ -1,111 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/// Gets `m_step` value. -/// -/// The starting row index must be divisible by `m_step`. -/// -/// @param m Total number of row. -/// -/// @return `m_step` value. -size_t kai_get_m_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m); - -/// Gets `n_step` value. -/// -/// The starting column index must be divisible by `n_step`. -/// -/// @param n Total number of column -/// -/// @return `n_step` value. 
-size_t kai_get_n_step_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n); - -/// Gets the default row stride in bytes of the LHS matrix. -/// -/// @param[in] k Number of columns. -/// -/// @return The default row stride in bytes of the LHS matrix. -size_t kai_get_lhs_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t k); - -/// Gets the offset in bytes to the data element in the LHS matrix buffer. -/// -/// @param[in] m_idx Row index. -/// @param[in] k_idx Column index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m_idx, size_t k_idx, size_t stride); - -/// Gets the offset in bytes to the data element in the packed RHS matrix buffer. -/// -/// @param[in] k Number of columns. -/// @param[in] n_idx Row index. -/// @param[in] k_idx Column index. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t k, size_t n_idx, size_t k_idx); - -/// Gets the default row stride in bytes of the destination matrix. -/// -/// @param[in] n Number of columns. -/// -/// @return The default row stride in bytes of the destination matrix. -size_t kai_get_dst_default_stride_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t n); - -/// Gets the offset in bytes to the data element in the destination matrix buffer. -/// -/// @param[in] m_idx Row index. -/// @param[in] n_idx Column index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m_idx, size_t n_idx, size_t stride); - -/// Gets the size in bytes of the destination matrix buffer. -/// -/// @param[in] m Number of rows. -/// @param[in] n Number of columns. -/// @param[in] stride Row stride in bytes. 
-/// -/// @return The size in bytes of the destination matrix buffer. -size_t kai_get_dst_size_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla(size_t m, size_t n, size_t stride); - -/// Runs the matrix multiplication microkernel followed by a clamp operation. -/// -/// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset -/// calculated using the following functions: -/// -/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. -/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. -/// * Output: @ref kai_get_dst_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla. -/// -/// @param[in] m Number of output rows to be computed. -/// @param[in] n Number of output columns to be computed. -/// @param[in] k Common dimension of the LHS and RHS operand. -/// @param[in] lhs LHS matrix buffer. -/// @param[in] packed_rhs Packed RHS buffer. -/// @param[in] dst Output matrix buffer. -/// @param[in] lhs_stride Row stride in bytes of the LHS matrix. -/// @param[in] dst_stride Row stride in bytes of the output matrix. -/// @param[in] clamp_min Minimum value to clamp the final result. -/// @param[in] clamp_max Maximum value to clamp the final result. 
-void kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla( - size_t m, size_t n, size_t k, // - const void* lhs, const void* packed_rhs, void* dst, // - size_t lhs_stride, size_t dst_stride, // - float clamp_min, float clamp_max); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus diff --git a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c deleted file mode 100644 index 14f5b8b5..00000000 --- a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.c +++ /dev/null @@ -1,498 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include - -#include "kai_common.h" - -static const size_t block_height = 16; -static const size_t subblock_width = 4; - -size_t kai_get_n_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n) { - KAI_UNUSED(n); - return 16; -} - -size_t kai_get_k_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t k) { - return k; -} - -size_t kai_get_rhs_default_stride_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n) { - return n * sizeof(uint16_t); -} - -size_t kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t n_idx, size_t k_idx, size_t stride) { - KAI_ASSUME(n_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); - KAI_UNUSED(stride); - - return n_idx * sizeof(uint16_t); -} - -size_t kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t k, size_t n_idx, size_t k_idx) { - KAI_ASSUME(n_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); - - return n_idx / block_height * - (block_height * sizeof(uint32_t) + - block_height * 
kai_round_up_multiple_usize(k, subblock_width) * sizeof(uint16_t)); -} - -size_t kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n, size_t k) { - return kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - k, kai_round_up_multiple_usize(n, block_height), 0); -} - -void kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t n, size_t k, // - const void* rhs, const void* bias, void* packed_rhs, // - size_t rhs_stride) { - size_t height = k; - const size_t width = n; - const void* in = rhs; - void* out = packed_rhs; - const size_t in_stride = rhs_stride; - uint16_t* pad_row = (uint16_t*)alloca(width * sizeof(uint16_t)); - - if (height % 4) { - memset(pad_row, 0, width * sizeof(uint16_t)); - } - - size_t out_stride = - block_height * kai_round_up_multiple_usize(height, 4) * sizeof(uint16_t) + block_height * sizeof(uint32_t); - - __asm__ __volatile__( - "mov x22, %x[width]\n" - "mov x21, %x[out]\n" - "cmp x22, #0x10\n" - "blt 2f\n" - "1:" // Bias: Full row loop body - "ldr q9, [%x[bias], #0x0]\n" - "sub x22, x22, #0x10\n" - "cmp x22, #0x10\n" - "str q9, [x21, #0x0]\n" - "ldr q22, [%x[bias], #0x10]\n" - "str q22, [x21, #0x10]\n" - "ldr q30, [%x[bias], #0x20]\n" - "str q30, [x21, #0x20]\n" - "ldr q3, [%x[bias], #0x30]\n" - "add %x[bias], %x[bias], #0x40\n" - "str q3, [x21, #0x30]\n" - "add x21, x21, %x[out_stride]\n" - "bge 1b\n" - "2:" // Bias: Tail row loop start - "cbz x22, 4f\n" - "3:" // Bias: Tail row loop body - "ldr w20, [%x[bias], #0x0]\n" - "sub x22, x22, #0x1\n" - "add %x[bias], %x[bias], #0x4\n" - "str w20, [x21]\n" - "add x21, x21, #0x4\n" - "cbnz x22, 3b\n" - "4:" // Bias: Done - "cmp %x[height], #0x10\n" - "add %x[out], %x[out], #0x40\n" - "blt 13f\n" - "5:" // Main row loop: Head - "mov x17, %x[in]\n" - "mov x16, %x[width]\n" - "mov x15, %x[out]\n" - "sub %x[height], %x[height], #0x10\n" - "add x14, x17, %x[in_stride]\n" - "add x13, x14, %x[in_stride]\n" - 
"add x12, x13, %x[in_stride]\n" - "cmp x16, #0x10\n" - "add x11, x12, %x[in_stride]\n" - "add x10, x11, %x[in_stride]\n" - "add x9, x10, %x[in_stride]\n" - "add x28, x9, %x[in_stride]\n" - "add x27, x28, %x[in_stride]\n" - "add x26, x27, %x[in_stride]\n" - "add x25, x26, %x[in_stride]\n" - "add x24, x25, %x[in_stride]\n" - "add x23, x24, %x[in_stride]\n" - "add x22, x23, %x[in_stride]\n" - "add x21, x22, %x[in_stride]\n" - "add x20, x21, %x[in_stride]\n" - "add %x[in], x20, %x[in_stride]\n" - "blt 7f\n" - "6:" // Main row loop: Column loop - "ldr q4, [x17], #0x10\n" - "ldr q11, [x14], #0x10\n" - "sub x16, x16, #0x10\n" - "ldr q10, [x13], #0x10\n" - "ldr q0, [x12], #0x10\n" - "cmp x16, #0x10\n" - "ldr q14, [x11], #0x10\n" - "ldr q15, [x10], #0x10\n" - "ldr q17, [x9], #0x10\n" - "ldr q16, [x28], #0x10\n" - "ldr q2, [x27], #0x10\n" - "ldr q7, [x26], #0x10\n" - "zip1 v30.8h, v4.8h, v10.8h\n" - "zip1 v26.8h, v11.8h, v0.8h\n" - "ldr q31, [x25], #0x10\n" - "ldr q5, [x24], #0x10\n" - "zip2 v20.8h, v4.8h, v10.8h\n" - "zip2 v4.8h, v11.8h, v0.8h\n" - "ldr q18, [x23], #0x10\n" - "ldr q6, [x22], #0x10\n" - "zip1 v12.8h, v14.8h, v17.8h\n" - "zip1 v1.8h, v15.8h, v16.8h\n" - "ldr q9, [x21], #0x10\n" - "ldr q19, [x20], #0x10\n" - "zip2 v11.8h, v14.8h, v17.8h\n" - "zip2 v14.8h, v15.8h, v16.8h\n" - "ldr q22, [x17], #0x10\n" - "ldr q27, [x14], #0x10\n" - "zip1 v13.8h, v2.8h, v31.8h\n" - "zip1 v0.8h, v7.8h, v5.8h\n" - "ldr q24, [x13], #0x10\n" - "ldr q17, [x12], #0x10\n" - "zip2 v15.8h, v2.8h, v31.8h\n" - "zip2 v21.8h, v7.8h, v5.8h\n" - "ldr q31, [x11], #0x10\n" - "ldr q23, [x10], #0x10\n" - "zip1 v10.8h, v18.8h, v9.8h\n" - "zip1 v8.8h, v6.8h, v19.8h\n" - "ldr q2, [x9], #0x10\n" - "ldr q3, [x28], #0x10\n" - "zip2 v16.8h, v18.8h, v9.8h\n" - "zip2 v9.8h, v6.8h, v19.8h\n" - "ldr q25, [x27], #0x10\n" - "ldr q6, [x26], #0x10\n" - "zip1 v29.8h, v22.8h, v24.8h\n" - "zip1 v18.8h, v27.8h, v17.8h\n" - "ldr q5, [x25], #0x10\n" - "ldr q7, [x24], #0x10\n" - "zip2 v28.8h, v22.8h, v24.8h\n" - "zip2 
v27.8h, v27.8h, v17.8h\n" - "ldr q19, [x23], #0x10\n" - "ldr q22, [x22], #0x10\n" - "zip1 v24.8h, v31.8h, v2.8h\n" - "zip1 v17.8h, v23.8h, v3.8h\n" - "zip2 v2.8h, v31.8h, v2.8h\n" - "ldr q31, [x21], #0x10\n" - "zip2 v23.8h, v23.8h, v3.8h\n" - "zip1 v3.8h, v25.8h, v5.8h\n" - "zip2 v5.8h, v25.8h, v5.8h\n" - "zip1 v25.8h, v6.8h, v7.8h\n" - "zip2 v7.8h, v6.8h, v7.8h\n" - "zip1 v6.8h, v19.8h, v31.8h\n" - "zip2 v19.8h, v19.8h, v31.8h\n" - "zip1 v31.8h, v30.8h, v26.8h\n" - "zip2 v26.8h, v30.8h, v26.8h\n" - "ldr q30, [x20], #0x10\n" - "str q31, [x15, #0x0]\n" - "zip1 v31.8h, v20.8h, v4.8h\n" - "zip2 v4.8h, v20.8h, v4.8h\n" - "zip1 v20.8h, v29.8h, v18.8h\n" - "zip2 v29.8h, v29.8h, v18.8h\n" - "zip1 v18.8h, v22.8h, v30.8h\n" - "zip2 v22.8h, v22.8h, v30.8h\n" - "str q26, [x15, #0x10]\n" - "zip1 v30.8h, v28.8h, v27.8h\n" - "zip2 v27.8h, v28.8h, v27.8h\n" - "str q31, [x15, #0x20]\n" - "zip1 v26.8h, v12.8h, v1.8h\n" - "zip2 v31.8h, v12.8h, v1.8h\n" - "str q4, [x15, #0x30]\n" - "zip1 v1.8h, v11.8h, v14.8h\n" - "zip2 v11.8h, v11.8h, v14.8h\n" - "str q20, [x15, #0x40]\n" - "zip1 v14.8h, v24.8h, v17.8h\n" - "zip2 v17.8h, v24.8h, v17.8h\n" - "str q29, [x15, #0x50]\n" - "zip1 v29.8h, v2.8h, v23.8h\n" - "zip2 v12.8h, v2.8h, v23.8h\n" - "str q30, [x15, #0x60]\n" - "zip1 v30.8h, v13.8h, v0.8h\n" - "zip2 v13.8h, v13.8h, v0.8h\n" - "str q27, [x15, #0x70]\n" - "zip1 v0.8h, v15.8h, v21.8h\n" - "zip2 v28.8h, v15.8h, v21.8h\n" - "str q26, [x15, #0x80]\n" - "zip1 v27.8h, v3.8h, v25.8h\n" - "zip2 v26.8h, v3.8h, v25.8h\n" - "str q31, [x15, #0x90]\n" - "zip1 v2.8h, v5.8h, v7.8h\n" - "zip2 v24.8h, v5.8h, v7.8h\n" - "str q1, [x15, #0xa0]\n" - "zip1 v23.8h, v10.8h, v8.8h\n" - "zip2 v7.8h, v10.8h, v8.8h\n" - "str q11, [x15, #0xb0]\n" - "zip1 v21.8h, v16.8h, v9.8h\n" - "zip2 v3.8h, v16.8h, v9.8h\n" - "str q14, [x15, #0xc0]\n" - "zip1 v20.8h, v6.8h, v18.8h\n" - "zip2 v18.8h, v6.8h, v18.8h\n" - "str q17, [x15, #0xd0]\n" - "zip1 v17.8h, v19.8h, v22.8h\n" - "zip2 v16.8h, v19.8h, v22.8h\n" - "str q29, [x15, 
#0xe0]\n" - "str q12, [x15, #0xf0]\n" - "str q30, [x15, #0x100]\n" - "str q13, [x15, #0x110]\n" - "str q0, [x15, #0x120]\n" - "str q28, [x15, #0x130]\n" - "str q27, [x15, #0x140]\n" - "str q26, [x15, #0x150]\n" - "str q2, [x15, #0x160]\n" - "str q24, [x15, #0x170]\n" - "str q23, [x15, #0x180]\n" - "str q7, [x15, #0x190]\n" - "str q21, [x15, #0x1a0]\n" - "str q3, [x15, #0x1b0]\n" - "str q20, [x15, #0x1c0]\n" - "str q18, [x15, #0x1d0]\n" - "str q17, [x15, #0x1e0]\n" - "str q16, [x15, #0x1f0]\n" - "add x15, x15, %x[out_stride]\n" - "bge 6b\n" - "7:" // Main row loop: Column loop skip - "cbz x16, 12f\n" - "cmp x16, #0x4\n" - "movi v16.8h, #0x0\n" - "str q16, [x15, #0x0]\n" - "str q16, [x15, #0x10]\n" - "str q16, [x15, #0x20]\n" - "str q16, [x15, #0x30]\n" - "str q16, [x15, #0x40]\n" - "str q16, [x15, #0x50]\n" - "str q16, [x15, #0x60]\n" - "str q16, [x15, #0x70]\n" - "str q16, [x15, #0x80]\n" - "str q16, [x15, #0x90]\n" - "str q16, [x15, #0xa0]\n" - "str q16, [x15, #0xb0]\n" - "str q16, [x15, #0xc0]\n" - "str q16, [x15, #0xd0]\n" - "str q16, [x15, #0xe0]\n" - "str q16, [x15, #0xf0]\n" - "str q16, [x15, #0x100]\n" - "str q16, [x15, #0x110]\n" - "str q16, [x15, #0x120]\n" - "str q16, [x15, #0x130]\n" - "str q16, [x15, #0x140]\n" - "str q16, [x15, #0x150]\n" - "str q16, [x15, #0x160]\n" - "str q16, [x15, #0x170]\n" - "str q16, [x15, #0x180]\n" - "str q16, [x15, #0x190]\n" - "str q16, [x15, #0x1a0]\n" - "str q16, [x15, #0x1b0]\n" - "str q16, [x15, #0x1c0]\n" - "str q16, [x15, #0x1d0]\n" - "str q16, [x15, #0x1e0]\n" - "str q16, [x15, #0x1f0]\n" - "blt 9f\n" - "8:" // Main row loop: width 4 loop: loop - "ldr d22, [x17], #0x8\n" - "ldr d21, [x14], #0x8\n" - "sub x16, x16, #0x4\n" - "ldr d19, [x13], #0x8\n" - "ldr d18, [x12], #0x8\n" - "cmp x16, #0x4\n" - "ldr d23, [x11], #0x8\n" - "ldr d20, [x10], #0x8\n" - "ldr d17, [x9], #0x8\n" - "ldr d16, [x28], #0x8\n" - "ldr d28, [x27], #0x8\n" - "ldr d27, [x26], #0x8\n" - "zip1 v22.8h, v22.8h, v19.8h\n" - "zip1 v19.8h, v21.8h, 
v18.8h\n" - "ldr d21, [x25], #0x8\n" - "ldr d18, [x24], #0x8\n" - "ldr d26, [x23], #0x8\n" - "ldr d25, [x22], #0x8\n" - "zip1 v24.8h, v23.8h, v17.8h\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "ldr d20, [x21], #0x8\n" - "ldr d16, [x20], #0x8\n" - "zip1 v23.8h, v22.8h, v19.8h\n" - "zip2 v19.8h, v22.8h, v19.8h\n" - "zip1 v22.8h, v28.8h, v21.8h\n" - "zip1 v18.8h, v27.8h, v18.8h\n" - "zip1 v21.8h, v24.8h, v17.8h\n" - "zip2 v17.8h, v24.8h, v17.8h\n" - "zip1 v20.8h, v26.8h, v20.8h\n" - "zip1 v16.8h, v25.8h, v16.8h\n" - "str q23, [x15, #0x0]\n" - "str q19, [x15, #0x10]\n" - "zip1 v19.8h, v22.8h, v18.8h\n" - "zip2 v18.8h, v22.8h, v18.8h\n" - "str q21, [x15, #0x80]\n" - "str q17, [x15, #0x90]\n" - "zip1 v17.8h, v20.8h, v16.8h\n" - "zip2 v16.8h, v20.8h, v16.8h\n" - "str q19, [x15, #0x100]\n" - "str q18, [x15, #0x110]\n" - "str q17, [x15, #0x180]\n" - "str q16, [x15, #0x190]\n" - "add x15, x15, #0x20\n" - "bge 8b\n" - "9:" // Main row loop: width 4 loop: skip - "cmp x16, #0x1\n" - "blt 11f\n" - "10:" // Main row loop: width 1 loop: loop - "ldr h23, [x17], #0x2\n" - "ldr h21, [x14], #0x2\n" - "sub x16, x16, #0x1\n" - "ldr h20, [x13], #0x2\n" - "ldr h19, [x12], #0x2\n" - "cmp x16, #0x1\n" - "ldr h22, [x11], #0x2\n" - "ldr h18, [x10], #0x2\n" - "ldr h17, [x9], #0x2\n" - "ldr h16, [x28], #0x2\n" - "ldr h27, [x27], #0x2\n" - "ldr h26, [x26], #0x2\n" - "zip1 v25.8h, v23.8h, v20.8h\n" - "zip1 v21.8h, v21.8h, v19.8h\n" - "ldr h20, [x25], #0x2\n" - "ldr h19, [x24], #0x2\n" - "ldr h24, [x23], #0x2\n" - "ldr h23, [x22], #0x2\n" - "zip1 v22.8h, v22.8h, v17.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "ldr h18, [x21], #0x2\n" - "ldr h16, [x20], #0x2\n" - "zip1 v21.8h, v25.8h, v21.8h\n" - "zip1 v20.8h, v27.8h, v20.8h\n" - "zip1 v19.8h, v26.8h, v19.8h\n" - "zip1 v17.8h, v22.8h, v17.8h\n" - "zip1 v18.8h, v24.8h, v18.8h\n" - "zip1 v16.8h, v23.8h, v16.8h\n" - "str d21, [x15, #0x0]\n" - "str d17, [x15, #0x80]\n" - "zip1 v17.8h, v20.8h, v19.8h\n" - "zip1 v16.8h, v18.8h, v16.8h\n" - "str d17, [x15, 
#0x100]\n" - "str d16, [x15, #0x180]\n" - "add x15, x15, #0x8\n" - "bge 10b\n" - "11:" // Main row loop: width 1 loop: skip - "12:" // Main row loop: odd col skip - "cmp %x[height], #0x10\n" - "add %x[out], %x[out], #0x200\n" - "bge 5b\n" - "cbz %x[height], 22f\n" - "13:" // Main loop skip - "14:" // Tail row loop: Head - "mov x17, %x[in]\n" - "mov x20, %x[width]\n" - "cmp %x[height], #0x3\n" - "mov x15, %x[out]\n" - "add x14, x17, %x[in_stride]\n" - "add x13, x14, %x[in_stride]\n" - "add x12, x13, %x[in_stride]\n" - "csel x13, x13, %x[pad_row], GE\n" - "add %x[in], x12, %x[in_stride]\n" - "csel x12, x12, %x[pad_row], GT\n" - "cmp %x[height], #0x1\n" - "sub %x[height], %x[height], #0x4\n" - "csel x14, x14, %x[pad_row], GT\n" - "cmp x20, #0x10\n" - "blt 16f\n" - "15:" // Tail row loop: Column loop - "ldr q20, [x17], #0x10\n" - "ldr q19, [x14], #0x10\n" - "sub x20, x20, #0x10\n" - "ldr q18, [x13], #0x10\n" - "ldr q17, [x12], #0x10\n" - "cmp x20, #0x10\n" - "ldr q24, [x17], #0x10\n" - "ldr q25, [x14], #0x10\n" - "ldr q23, [x13], #0x10\n" - "ldr q16, [x12], #0x10\n" - "zip1 v22.8h, v20.8h, v18.8h\n" - "zip1 v21.8h, v19.8h, v17.8h\n" - "zip2 v20.8h, v20.8h, v18.8h\n" - "zip2 v19.8h, v19.8h, v17.8h\n" - "zip1 v18.8h, v24.8h, v23.8h\n" - "zip1 v17.8h, v25.8h, v16.8h\n" - "zip2 v24.8h, v24.8h, v23.8h\n" - "zip2 v16.8h, v25.8h, v16.8h\n" - "zip1 v23.8h, v22.8h, v21.8h\n" - "zip2 v22.8h, v22.8h, v21.8h\n" - "zip1 v21.8h, v20.8h, v19.8h\n" - "zip2 v20.8h, v20.8h, v19.8h\n" - "zip1 v19.8h, v18.8h, v17.8h\n" - "zip2 v18.8h, v18.8h, v17.8h\n" - "zip1 v17.8h, v24.8h, v16.8h\n" - "zip2 v16.8h, v24.8h, v16.8h\n" - "str q23, [x15, #0x0]\n" - "str q22, [x15, #0x10]\n" - "str q21, [x15, #0x20]\n" - "str q20, [x15, #0x30]\n" - "str q19, [x15, #0x40]\n" - "str q18, [x15, #0x50]\n" - "str q17, [x15, #0x60]\n" - "str q16, [x15, #0x70]\n" - "add x15, x15, %x[out_stride]\n" - "bge 15b\n" - "16:" // Tail row loop: Column loop skip - "cbz x20, 21f\n" - "cmp x20, #0x4\n" - "movi v16.8h, 
#0x0\n" - "str q16, [x15, #0x0]\n" - "str q16, [x15, #0x10]\n" - "str q16, [x15, #0x20]\n" - "str q16, [x15, #0x30]\n" - "str q16, [x15, #0x40]\n" - "str q16, [x15, #0x50]\n" - "str q16, [x15, #0x60]\n" - "str q16, [x15, #0x70]\n" - "blt 18f\n" - "17:" // Tail row loop: width 4 loop: loop - "ldr d18, [x17], #0x8\n" - "ldr d19, [x14], #0x8\n" - "sub x20, x20, #0x4\n" - "ldr d17, [x13], #0x8\n" - "ldr d16, [x12], #0x8\n" - "cmp x20, #0x4\n" - "zip1 v18.8h, v18.8h, v17.8h\n" - "zip1 v16.8h, v19.8h, v16.8h\n" - "zip1 v17.8h, v18.8h, v16.8h\n" - "zip2 v16.8h, v18.8h, v16.8h\n" - "str q17, [x15, #0x0]\n" - "str q16, [x15, #0x10]\n" - "add x15, x15, #0x20\n" - "bge 17b\n" - "18:" // Tail row loop: width 4 loop: skip - "cmp x20, #0x1\n" - "blt 20f\n" - "19:" // Tail row loop: width 1 loop: loop - "ldr h19, [x17], #0x2\n" - "ldr h18, [x14], #0x2\n" - "sub x20, x20, #0x1\n" - "ldr h17, [x13], #0x2\n" - "ldr h16, [x12], #0x2\n" - "cmp x20, #0x1\n" - "zip1 v17.8h, v19.8h, v17.8h\n" - "zip1 v16.8h, v18.8h, v16.8h\n" - "zip1 v16.8h, v17.8h, v16.8h\n" - "str d16, [x15, #0x0]\n" - "add x15, x15, #0x8\n" - "bge 19b\n" - "20:" // Tail row loop: width 1 loop: skip - "21:" // Tail row loop: odd col skip - "cmp %x[height], #0x1\n" - "add %x[out], %x[out], #0x80\n" - "bge 14b\n" - "22:" // Done - : [bias] "+&r"(bias), [height] "+&r"(height), [in] "+&r"(in), [out] "+&r"(out) - : [in_stride] "r"(in_stride), [out_stride] "r"(out_stride), [pad_row] "r"(pad_row), [width] "r"(width) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", - "x25", "x26", "x27", "x28"); -} diff --git a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h 
b/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h deleted file mode 100644 index ba45e6e7..00000000 --- a/src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h +++ /dev/null @@ -1,90 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/// Gets `n_step` value. -/// -/// The starting row index must be divisible by `n_step`. -/// -/// @param n Total number of row. -/// -/// @return `n_step` value. -size_t kai_get_n_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n); - -/// Gets `k_step` value. -/// -/// The starting column index must be divisible by `k_step`. -/// -/// @param k Total number of column. -/// -/// @return `k_step` value. -size_t kai_get_k_step_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t k); - -/// Gets the default row stride in bytes of the RHS matrix. -/// -/// @param[in] n Number of columns. -/// -/// @return The default row stride in bytes of the LHS matrix. -size_t kai_get_rhs_default_stride_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n); - -/// Gets the offset in bytes to the data element in the RHS matrix buffer. -/// -/// @param[in] n_idx Column index. -/// @param[in] k_idx Row index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t n_idx, size_t k_idx, size_t stride); - -/// Gets the offset in bytes to the data element in the packed RHS buffer. -/// -/// @param[in] k Number of columns. -/// @param[in] n_idx Row index. -/// @param[in] k_idx Column index. -/// -/// @return The offset in bytes to the data element. 
-size_t kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t k, size_t n_idx, size_t k_idx); - -/// Gets the size in bytes of the packed RHS buffer. -/// -/// @param[in] n Number of rows. -/// @param[in] k Number of columns. -/// -/// @return The size in bytes of the packed RHS buffer. -size_t kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon(size_t n, size_t k); - -/// Runs the matrix multiplication microkernel followed by a clamp operation. -/// -/// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset -/// calculated using the following functions: -/// -/// * LHS: @ref kai_get_lhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon. -/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon. -/// * Output: @ref kai_get_dst_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon. -/// -/// @param[in] n Number of output rows to be computed. -/// @param[in] k Number of output columns to be computed. -/// @param[in] rhs RHS matrix buffer. -/// @param[in] bias Bias matrix buffer. -/// @param[in] packed_rhs Packed RHS buffer. -/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. -void kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon( - size_t n, size_t k, // - const void* rhs, const void* bias, void* packed_rhs, // - size_t rhs_stride); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus diff --git a/test/common/bfloat16.hpp b/test/common/bfloat16.hpp index c8c2aac8..8b61d92e 100644 --- a/test/common/bfloat16.hpp +++ b/test/common/bfloat16.hpp @@ -6,8 +6,6 @@ #pragma once -#include - #include #include #include @@ -39,38 +37,42 @@ public: /// Move assignment. BFloat16& operator=(BFloat16&&) = default; - /// Creates a new object from the `bfloat16_t` value. 
- constexpr explicit BFloat16(bfloat16_t value) : _data(value) { - } - /// Creates a new object from the specified numeric value. template , bool> = true> - constexpr explicit BFloat16(T value) : _data(static_cast(static_cast(value))) { + explicit BFloat16(T value) : _data(0) { + const auto value_f32 = static_cast(value); + asm("bfcvt %h[output], %s[input]" : [output] "=w"(_data) : [input] "w"(value_f32)); } /// Assigns to the specified numeric value which will be converted to `bfloat16_t`. template , bool> = true> BFloat16& operator=(T value) { - _data = static_cast(value); + const auto value_f32 = static_cast(value); + asm("bfcvt %h[output], %s[input]" : [output] "=w"(_data) : [input] "w"(value_f32)); return *this; } /// Converts to numeric type `T`. template , bool> = true> explicit operator T() const { - return static_cast(_data); + union { + float f32; + uint32_t u32; + } data; + + data.u32 = static_cast(_data) << 16; + + return static_cast(data.f32); } /// Equality operator. bool operator==(BFloat16 rhs) const { - const auto* lhs_data = reinterpret_cast(&_data); - const auto* rhs_data = reinterpret_cast(&rhs._data); - return *lhs_data == *rhs_data; + return _data == rhs._data; } /// Unequality operator. bool operator!=(BFloat16 rhs) const { - return !(*this == rhs); + return _data != rhs._data; } /// Writes the value to the output stream. @@ -82,7 +84,7 @@ public: friend std::ostream& operator<<(std::ostream& os, BFloat16 value); private: - bfloat16_t _data; + uint16_t _data; }; } // namespace kai::test diff --git a/test/common/compare.cpp b/test/common/compare.cpp index ec561515..9dcfb104 100644 --- a/test/common/compare.cpp +++ b/test/common/compare.cpp @@ -55,7 +55,7 @@ bool compare_raw( y >= rect.start_row() && y < rect.end_row() && x >= rect.start_col() && x < rect.end_col(); const auto imp_value = read_array(imp_data, y * full_width + x); - const auto ref_value = in_roi ? read_array(ref_data, y * full_width + x) : 0; + const auto ref_value = in_roi ? 
read_array(ref_data, y * full_width + x) : static_cast(0); const auto [abs_err, rel_err] = calculate_error(imp_value, ref_value); @@ -199,10 +199,10 @@ bool compare( case DataFormat::PackFormat::BIAS_PER_ROW: if (data_type == DataType::FP16 && offset_dt == DataType::FP16) { - return compare_per_row( + return compare_per_row( imp_data, ref_data, format, full_height, full_width, rect, handler); } else if (data_type == DataType::BF16 && offset_dt == DataType::FP32) { - return compare_per_row( + return compare_per_row( imp_data, ref_data, format, full_height, full_width, rect, handler); } diff --git a/test/common/float16.hpp b/test/common/float16.hpp index 5cb2ba93..7cbc9b24 100644 --- a/test/common/float16.hpp +++ b/test/common/float16.hpp @@ -11,7 +11,7 @@ namespace kai::test { /// Half-precision floating-point. -using Float16 = _Float16; +using Float16 = __fp16; /// Writes the value to the output stream. /// diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index d666d903..803fa23b 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -13,6 +13,7 @@ #include "src/kai_common.h" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" +#include "test/common/float16.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" #include "test/reference/binary_elementwise.hpp" @@ -169,7 +170,7 @@ std::vector matmul( break; case DataType::FP16: - tmp_dst = matmul_any_type<_Float16>(lhs, rhs, m, n, k, lhs_transposed, rhs_transposed); + tmp_dst = matmul_any_type(lhs, rhs, m, n, k, lhs_transposed, rhs_transposed); break; default: diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index 93d6e8ef..141892b2 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -85,7 +85,7 @@ std::vector dequantize_any_type( for (size_t x = 0; x < width; ++x) { const ZeroPoint input = read_array(data, y * width + x); - const Scale output = Scale(input - zero_point) * scale; + const Scale 
output = static_cast(input - zero_point) * scale; write_array(dst.data(), y * width + x, output); } } diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp index 74c87c04..bee2182a 100644 --- a/test/reference/transpose.cpp +++ b/test/reference/transpose.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include "kai_common.h" diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index f20ead47..06bd3a84 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -22,9 +22,7 @@ #include "src/kai_common.h" #include "src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h" -#include "src/matmul/matmul_clamp_f32_bf16_bf16_f32/kai_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla.h" #include "src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h" -#include "src/matmul/matmul_transpose_pack_rhs_bias_bf16_f32/kai_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon.h" #include "test/common/compare.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" @@ -185,7 +183,9 @@ struct MatMulMethod { KAI_UNUSED(rhs_stride); if (fn_main_hybrid_fp16) { - fn_main_hybrid_fp16(m, n, k, lhs, rhs, dst, lhs_stride, dst_stride, clamp_min, clamp_max); + fn_main_hybrid_fp16( + m, n, k, lhs, rhs, dst, lhs_stride, dst_stride, static_cast(clamp_min), + static_cast(clamp_max)); } else { KAI_ERROR("Main kernel is not available!"); } @@ -226,38 +226,6 @@ static const std::array matmul_methods = { .fn_main_hybrid_fp16 = kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla, }, - - MatMulMethod{ - .name = "matmul_nt_nt_f32_bf16_bf16_6x16_neon_mmla", - - .m0 = 6, - .n0 = 16, - .k0 = 0, // Not applicable. 
- - .lhs_transposed = false, - .rhs_transposed = false, - - .dst_format = DataFormat(DataType::FP32), - .lhs_format = DataFormat(DataType::BF16), - .packed_lhs_format = DataFormat(DataType::UNKNOWN), - .rhs_format = DataFormat(DataType::BF16), - .packed_rhs_format = DataFormat( - DataType::BF16, 16, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP32, DataType::UNKNOWN, 16, 4), - .bias_format = DataFormat(DataType::FP32), - - .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla, - .fn_get_packed_lhs_size = nullptr, - .fn_get_packed_lhs_offset = nullptr, - .fn_pack_lhs = nullptr, - - .fn_get_rhs_offset = kai_get_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, - .fn_get_packed_rhs_size = kai_get_packed_rhs_size_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, - .fn_get_packed_rhs_offset = - kai_get_packed_rhs_offset_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, - .fn_pack_rhs = kai_run_matmul_transpose_pack_rhs_bias_bf16p16x4zf32_bf16_f32_neon, - - .fn_main_hybrid_fp16 = kai_run_matmul_clamp_f32_bf16_bf16p16x4zf32_6x16_neon_mmla, - }, }; /// Matrix multiplication shape. @@ -274,10 +242,12 @@ using MatMulTestParams = std::tuple; void PrintTo(const MatMulTestParams& param, std::ostream* os) { const auto& [method_no, shape, portion] = param; + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index) *os << "method: " << matmul_methods[method_no].name << ", m: " << shape.m << ", n: " << shape.n << ", k: " << shape.k << ", portion: { start_row: " << portion.start_row() << ", start_col: " << portion.start_col() << ", height: " << portion.height() << ", width: " << portion.width() << "}"; + // NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index) } /// Matrix multiplication test fixture. 
@@ -375,10 +345,14 @@ protected: } private: + // NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) static std::map _data; + // NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) }; +// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) std::map MatMulTest::_data; +// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) /// Tests the LHS packing kernel. TEST_P(MatMulTest, PackedLhs) { -- GitLab From eb3537e97c438ba84a5a89151ec9d55828971325 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Thu, 30 May 2024 10:50:45 +0100 Subject: [PATCH 3/3] Address comments Signed-off-by: Viet-Hoa Do --- CMakeLists.txt | 4 +- ...s_pack_kxn_f16p16x1biasf16_f16_f16_neon.c} | 65 +++++++------- ...hs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h | 80 +++++++++++++++++ ...f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c} | 62 +++++++------ ...f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h} | 72 ++++++++------- ...ias_pack_transpose_f16_f16p16x1zf16_neon.h | 90 ------------------- test/tests/matmul_test.cpp | 74 ++++++++++----- 7 files changed, 246 insertions(+), 201 deletions(-) rename src/matmul/{matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c => kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c} (77%) create mode 100644 src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h rename src/matmul/{matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c => matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c} (98%) rename src/matmul/{matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h => matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h} (57%) delete mode 100644 src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f22873b4..0ff56f89 100644 --- a/CMakeLists.txt +++ 
b/CMakeLists.txt @@ -62,8 +62,8 @@ set(KLEIDIAI_FILES_NEON ) set(KLEIDIAI_FILES_NEON_FP16 - src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c - src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c + src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c + src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c ) set(KLEIDIAI_FILES_NEON_DOTPROD diff --git a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c b/src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c similarity index 77% rename from src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c rename to src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c index bea751d3..4bf04303 100644 --- a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.c +++ b/src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c @@ -4,58 +4,61 @@ // SPDX-License-Identifier: Apache-2.0 // +#if !defined(__aarch64__) +#error This file must be compiled for AArch64. +#else // Architectural features check. 
+ #include #include #include "kai_common.h" -static const size_t block_height = 16; +static const size_t kai_nr = 16; +static const size_t kai_kr = 1; -size_t kai_get_n_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n) { - KAI_UNUSED(n); - return 16; +size_t kai_get_n_step_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(void) { + return kai_nr; } -size_t kai_get_k_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t k) { - return k; -} +size_t kai_get_rhs_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n_idx) { + KAI_ASSUME(n_idx % kai_nr == 0); -size_t kai_get_rhs_default_stride_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n) { - return n * sizeof(uint16_t); + return n_idx * sizeof(uint16_t); } - -size_t kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t n_idx, size_t k_idx, size_t stride) { - KAI_ASSUME(n_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); - KAI_UNUSED(stride); - +size_t kai_get_bias_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n_idx) { return n_idx * sizeof(uint16_t); } -size_t kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t k, size_t n_idx, size_t k_idx) { - KAI_ASSUME(n_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); +size_t kai_get_rhs_packed_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t k, size_t n_idx) { + KAI_ASSUME(n_idx % kai_nr == 0); - return n_idx / block_height * (block_height * sizeof(uint16_t) + block_height * k * sizeof(uint16_t)); + return n_idx / kai_nr * (kai_nr * sizeof(uint16_t) + kai_nr * k * sizeof(uint16_t)); } -size_t kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n, size_t k) { - return kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - k, kai_roundup(n, block_height), 0); +size_t kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n, size_t k) { + return 
kai_get_rhs_packed_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(k, kai_roundup(n, kai_nr)); } -void kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t n, size_t k, // - const void* rhs, const void* bias, void* packed_rhs, // - size_t rhs_stride) { +void kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon( + size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, + const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params) { + KAI_ASSUME(num_groups == 1); + KAI_ASSUME(nr == kai_nr); + KAI_ASSUME(kr == kai_kr); + KAI_ASSUME(sr == 1); + KAI_ASSUME(rhs != NULL); + KAI_ASSUME(bias != NULL); + KAI_ASSUME(scale == NULL); + KAI_ASSUME(rhs_packed != NULL); + KAI_ASSUME(extra_bytes == 0); + KAI_ASSUME(params == NULL); + size_t height = k; const size_t width = n; const void* in = rhs; - void* out = packed_rhs; + void* out = rhs_packed; const size_t in_stride = rhs_stride; - size_t out_stride = block_height * height * sizeof(uint16_t) + block_height * sizeof(uint16_t); + size_t out_stride = kai_nr * height * sizeof(uint16_t) + kai_nr * sizeof(uint16_t); __asm__ __volatile__( "mov x22, %x[width]\n" @@ -218,3 +221,5 @@ void kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "x22", "x23", "x24", "x25"); } + +#endif // Architectural features check. diff --git a/src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h b/src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h new file mode 100644 index 00000000..7b131d0d --- /dev/null +++ b/src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h @@ -0,0 +1,80 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Gets n step value. 
+/// +/// The starting row index must be divisible by `n_step`. +/// +/// @return The n step value. +size_t kai_get_n_step_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(void); + +/// Gets the offset in bytes to the data element in the RHS matrix buffer. +/// +/// @param[in] n_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_rhs_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n_idx); + +/// Gets the offset in bytes to the data element in the bias buffer. +/// +/// @param[in] n_idx Column index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_bias_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n_idx); + +/// Gets the offset in bytes to the data element in the packed RHS buffer. +/// +/// @param[in] k Number of columns. +/// @param[in] n_idx Row index. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_rhs_packed_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t k, size_t n_idx); + +/// Gets the size in bytes of the packed RHS buffer. +/// +/// @param[in] n Number of rows. +/// @param[in] k Number of columns. +/// +/// @return The size in bytes of the packed RHS buffer. +size_t kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(size_t n, size_t k); + +/// Runs the RHS packing function for matrix multiplication. +/// +/// The pointer to each buffer (RHS, bias and packed RHS) must be advanced by the offset +/// calculated using the following functions: +/// +/// * RHS: @ref kai_get_rhs_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon. +/// * Bias: @ref kai_get_bias_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon. +/// * Packed RHS: @ref kai_get_rhs_packed_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon. +/// +/// @param[in] num_groups Number of groups. It must be 1. +/// @param[in] n Number of columns of the output matrix. +/// @param[in] k Common dimension between the LHS and RHS matrix. +/// @param[in] nr Block size in N dimension.
It must be 16. +/// @param[in] kr Block size in K dimension. It must be 1. +/// @param[in] sr Number of kr splits. It must be 1. +/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. +/// @param[in] rhs RHS matrix data buffer. +/// @param[in] bias Bias matrix data buffer. +/// @param[in] scale Scale data buffer. It must be NULL. +/// @param[out] rhs_packed Packed RHS matrix. +/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. +/// @param[in] params Extra packing parameters. It must be NULL. +void kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon( + size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, + const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c b/src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c similarity index 98% rename from src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c rename to src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c index 685ccf43..c4996ebd 100644 --- a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.c +++ b/src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c @@ -4,61 +4,67 @@ // SPDX-License-Identifier: Apache-2.0 // +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. +#else // Architectural features check. 
+ #include #include #include #include "kai_common.h" -static const size_t block_height = 6; -static const size_t block_width = 16; +static const size_t kai_mr = 6; +static const size_t kai_nr = 16; +static const size_t kai_kr = 1; +static const size_t kai_sr = 1; -size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m) { - KAI_UNUSED(m); +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { + return kai_mr; +} - return 6; +size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { + return kai_nr; } -size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n) { - KAI_UNUSED(n); +size_t kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { + return kai_nr; +} - return 16; +size_t kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { + return kai_kr; } -size_t kai_get_lhs_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k) { - return k * sizeof(__fp16); +size_t kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void) { + return kai_sr; } -size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t k_idx, size_t stride) { - KAI_ASSUME(m_idx % block_height == 0); - KAI_ASSUME(k_idx == 0); +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m_idx, size_t stride) { + KAI_ASSUME(m_idx % kai_mr == 0); return m_idx * stride; } -size_t kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k, size_t n_idx, size_t k_idx) { - KAI_ASSUME(n_idx % block_width == 0); - KAI_ASSUME(k_idx == 0); +size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t n_idx, size_t k) { + KAI_ASSUME(n_idx % kai_nr == 0); - return n_idx / block_width * (block_width * sizeof(__fp16) + block_width * k * sizeof(__fp16)); + return n_idx / kai_nr * (kai_nr * sizeof(__fp16) + kai_nr * k * sizeof(__fp16)); } 
-size_t kai_get_dst_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n) { - return n * sizeof(__fp16); -} - -size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t n_idx, size_t stride) { - KAI_ASSUME(m_idx % block_height == 0); - KAI_ASSUME(n_idx % block_width == 0); +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( + size_t m_idx, size_t n_idx, size_t stride) { + KAI_ASSUME(m_idx % kai_mr == 0); + KAI_ASSUME(n_idx % kai_nr == 0); return m_idx * stride + n_idx * sizeof(__fp16); } -size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m, size_t n, size_t stride) { - return m * stride + n * sizeof(__fp16); +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m, size_t n) { + return m * n * sizeof(__fp16); } -void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( +void kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( size_t m, size_t n, size_t k, // const void* lhs, const void* packed_rhs, void* dst, // size_t lhs_stride, size_t dst_stride, // @@ -3021,3 +3027,5 @@ void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } + +#endif // Architectural features check. 
diff --git a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h b/src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h similarity index 57% rename from src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h rename to src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h index 01ad4d5c..d98e5442 100644 --- a/src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h +++ b/src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h @@ -6,61 +6,67 @@ #pragma once +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. +#else // Architectural features check. + #include #ifdef __cplusplus extern "C" { #endif // __cplusplus -/// Gets `m_step` value. +/// Gets m step value. /// /// The starting row index must be divisible by `m_step`. /// -/// @param m Total number of row. -/// -/// @return `m_step` value. -size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m); +/// @return The m step value. +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void); -/// Gets `n_step` value. +/// Gets n step value. /// /// The starting column index must be divisible by `n_step`. /// -/// @param n Total number of column +/// @return The n step value. +size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void); + +/// Gets nr value. +/// +/// This is the packing parameter which must be used to pack the RHS matrix. /// -/// @return `n_step` value. -size_t kai_get_n_step_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n); +/// @return The nr value. 
+size_t kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void); -/// Gets the default row stride in bytes of the LHS matrix. +/// Gets kr value. /// -/// @param[in] k Number of columns. +/// This is the packing parameter which must be used to pack the RHS matrix. +/// +/// @return The kr value. +size_t kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void); + +/// Gets sr value. +/// +/// This is the packing parameter which must be used to pack the RHS matrix. /// -/// @return The default row stride in bytes of the LHS matrix. -size_t kai_get_lhs_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k); +/// @return The sr value. +size_t kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(void); /// Gets the offset in bytes to the data element in the LHS matrix buffer. /// /// @param[in] m_idx Row index. -/// @param[in] k_idx Column index. /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes to the data element. -size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t k_idx, size_t stride); +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m_idx, size_t stride); /// Gets the offset in bytes to the data element in the packed RHS matrix buffer. /// -/// @param[in] k Number of columns. /// @param[in] n_idx Row index. -/// @param[in] k_idx Column index. +/// @param[in] k Number of columns. /// /// @return The offset in bytes to the data element. -size_t kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t k, size_t n_idx, size_t k_idx); - -/// Gets the default row stride in bytes of the destination matrix. -/// -/// @param[in] n Number of columns. -/// -/// @return The default row stride in bytes of the destination matrix. 
-size_t kai_get_dst_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t n); +size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t n_idx, size_t k); /// Gets the offset in bytes to the data element in the destination matrix buffer. /// @@ -69,25 +75,25 @@ size_t kai_get_dst_default_stride_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_ml /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes to the data element. -size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m_idx, size_t n_idx, size_t stride); +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( + size_t m_idx, size_t n_idx, size_t stride); /// Gets the size in bytes of the destination matrix buffer. /// /// @param[in] m Number of rows. /// @param[in] n Number of columns. -/// @param[in] stride Row stride in bytes. /// /// @return The size in bytes of the destination matrix buffer. -size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m, size_t n, size_t stride); +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla(size_t m, size_t n); /// Runs the matrix multiplication microkernel followed by a clamp operation. /// /// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset /// calculated using the following functions: /// -/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. -/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. -/// * Output: @ref kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla. +/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla. +/// * Packed RHS: @ref kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla. +/// * Output: @ref kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla. 
/// /// @param[in] m Number of output rows to be computed. /// @param[in] n Number of output columns to be computed. @@ -99,7 +105,7 @@ size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla(size_t m /// @param[in] dst_stride Row stride in bytes of the output matrix. /// @param[in] clamp_min Minimum value to clamp the final result. /// @param[in] clamp_max Maximum value to clamp the final result. -void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( +void kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla( size_t m, size_t n, size_t k, // const void* lhs, const void* packed_rhs, void* dst, // size_t lhs_stride, size_t dst_stride, // @@ -108,3 +114,5 @@ void kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla( #ifdef __cplusplus } // extern "C" #endif // __cplusplus + +#endif // Architectural features check. diff --git a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h b/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h deleted file mode 100644 index 117d69d8..00000000 --- a/src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h +++ /dev/null @@ -1,90 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/// Gets `n_step` value. -/// -/// The starting row index must be divisible by `n_step`. -/// -/// @param n Total number of row. -/// -/// @return `n_step` value. -size_t kai_get_n_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n); - -/// Gets `k_step` value. -/// -/// The starting column index must be divisible by `k_step`. -/// -/// @param k Total number of column. -/// -/// @return `k_step` value. 
-size_t kai_get_k_step_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t k); - -/// Gets the default row stride in bytes of the RHS matrix. -/// -/// @param[in] n Number of columns. -/// -/// @return The default row stride in bytes of the LHS matrix. -size_t kai_get_rhs_default_stride_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n); - -/// Gets the offset in bytes to the data element in the RHS matrix buffer. -/// -/// @param[in] n_idx Column index. -/// @param[in] k_idx Row index. -/// @param[in] stride Row stride in bytes. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t n_idx, size_t k_idx, size_t stride); - -/// Gets the offset in bytes to the data element in the packed RHS buffer. -/// -/// @param[in] k Number of columns. -/// @param[in] n_idx Row index. -/// @param[in] k_idx Column index. -/// -/// @return The offset in bytes to the data element. -size_t kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t k, size_t n_idx, size_t k_idx); - -/// Gets the size in bytes of the packed RHS buffer. -/// -/// @param[in] n Number of rows. -/// @param[in] k Number of columns. -/// -/// @return The size in bytes of the packed RHS buffer. -size_t kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon(size_t n, size_t k); - -/// Runs the matrix multiplication microkernel followed by a clamp operation. -/// -/// The pointer of each buffers (LHS, packed RHS and output) needs to be added with offset -/// calculated using the following functions: -/// -/// * LHS: @ref kai_get_lhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon. -/// * Packed RHS: @ref kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon. -/// * Output: @ref kai_get_dst_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon. -/// -/// @param[in] n Number of output rows to be computed. 
-/// @param[in] k Number of output columns to be computed. -/// @param[in] rhs RHS matrix buffer. -/// @param[in] bias Bias matrix buffer. -/// @param[in] packed_rhs Packed RHS buffer. -/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. -void kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon( - size_t n, size_t k, // - const void* rhs, const void* bias, void* packed_rhs, // - size_t rhs_stride); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index 06bd3a84..544d17e9 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -21,8 +21,8 @@ #include #include "src/kai_common.h" -#include "src/matmul/matmul_clamp_f16_f16_f16_f16/kai_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla.h" -#include "src/matmul/matmul_rhs_bias_pack_transpose_f16_f16_f16/kai_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon.h" +#include "src/matmul/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" +#include "src/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h" #include "test/common/compare.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" @@ -54,14 +54,41 @@ struct MatMulMethod { DataFormat packed_rhs_format; ///< Data format of the packed RHS matrix. DataFormat bias_format; ///< Data format of the bias vector. + /// Gets mr value. + /// + /// This is the packing parameter which must be used to pack the LHS matrix (if necessary). + /// + /// @return The mr value. + std::function fn_get_mr; + + /// Gets nr value. + /// + /// This is the packing parameter which must be used to pack the RHS matrix (if necessary). + /// + /// @return The nr value. + std::function fn_get_nr; + + /// Gets kr value. + /// + /// This is the packing parameter which must be used to pack the LHS and RHS matrix (if necessary). + /// + /// @return The kr value. + std::function fn_get_kr; + + /// Gets sr value. 
+ /// + /// This is the packing parameter which must be used to pack the RHS matrix. + /// + /// @return The sr value. + std::function fn_get_sr; + /// Gets the offset in bytes of the LHS matrix. /// /// @param[in] m_idx Coordinate of the matrix in M dimension. - /// @param[in] k_idx Coordinate of the matrix in K dimension. /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes. - std::function fn_get_lhs_offset; + std::function fn_get_lhs_offset; /// Gets the size in bytes of the packed LHS matrix. /// @@ -88,6 +115,7 @@ struct MatMulMethod { /// @param[out] packed_lhs Packed LHS matrix data buffer. std::function fn_pack_lhs; + /// Gets a value indicating whether LHS packing is needed. [[nodiscard]] bool is_pack_lhs_needed() const { return fn_pack_lhs != nullptr; } @@ -95,11 +123,9 @@ struct MatMulMethod { /// Gets the offset in bytes of the RHS matrix. /// /// @param[in] n_idx Coordinate of the matrix in N dimension. - /// @param[in] k_idx Coordinate of the matrix in K dimension. - /// @param[in] stride Row stride in bytes. /// /// @return The offset in bytes. - std::function fn_get_rhs_offset; + std::function fn_get_rhs_offset; /// Gets the size in bytes of the packed RHS matrix. /// @@ -113,12 +139,13 @@ struct MatMulMethod { /// /// @param[in] k Size of the matrix in K dimension. /// @param[in] n_idx Coordinate of the matrix in N dimension. - /// @param[in] k_idx Coordinate of the matrix in K dimension. /// /// @return The offset in bytes. - std::function fn_get_packed_rhs_offset; + std::function fn_get_packed_rhs_offset; - std::function + std::function fn_pack_rhs; /// Performs matrix multiplication. 
@@ -166,7 +193,9 @@ struct MatMulMethod { KAI_UNUSED(packed_rhs); if (fn_pack_rhs != nullptr) { - fn_pack_rhs(n, k, rhs, bias, packed_rhs, rhs_row_stride); + fn_pack_rhs( + 1, n, k, fn_get_nr(), fn_get_kr(), fn_get_sr(), rhs_row_stride, rhs, bias, nullptr, packed_rhs, 0, + nullptr); } else { KAI_ERROR("RHS pre-processing is not supported!"); } @@ -214,17 +243,22 @@ static const std::array matmul_methods = { DataType::FP16, 16, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP16, DataType::UNKNOWN, 16, 1), .bias_format = DataFormat(DataType::FP16), - .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla, + .fn_get_mr = nullptr, + .fn_get_nr = kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla, + .fn_get_kr = kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla, + .fn_get_sr = kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla, + + .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla, .fn_get_packed_lhs_size = nullptr, .fn_get_packed_lhs_offset = nullptr, .fn_pack_lhs = nullptr, - .fn_get_rhs_offset = kai_get_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, - .fn_get_packed_rhs_size = kai_get_packed_rhs_size_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, - .fn_get_packed_rhs_offset = kai_get_packed_rhs_offset_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, - .fn_pack_rhs = kai_run_matmul_rhs_bias_pack_transpose_f16_f16p16x1zf16_neon, + .fn_get_rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon, + .fn_get_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon, + .fn_get_packed_rhs_offset = kai_get_rhs_packed_offset_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon, + .fn_pack_rhs = kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon, - .fn_main_hybrid_fp16 = kai_run_matmul_clamp_f16_f16_f16p16x1zf16_6x16_neon_mla, + .fn_main_hybrid_fp16 = 
kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla, }, }; @@ -381,7 +415,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_size = method.packed_lhs_format.default_size_in_bytes(lhs_h, lhs_w); ASSERT_EQ(packed_lhs_size, ref_packed_lhs_size); - const auto lhs_offset = method.fn_get_lhs_offset(rect.start_row(), 0, ref_lhs_row_stride); + const auto lhs_offset = method.fn_get_lhs_offset(rect.start_row(), ref_lhs_row_stride); const auto ref_lhs_offset = method.lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(lhs_offset, ref_lhs_offset); @@ -428,7 +462,7 @@ TEST_P(MatMulTest, PackedRhs) { const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); - const auto rhs_offset = method.fn_get_rhs_offset(rect.start_row(), rect.start_col(), ref_rhs_row_stride); + const auto rhs_offset = method.fn_get_rhs_offset(rect.start_row()); const auto ref_rhs_offset = method.rhs_format.default_offset_in_bytes(rhs_start_row, rhs_start_col, rhs_w); ASSERT_EQ(rhs_offset, ref_rhs_offset); @@ -436,7 +470,7 @@ TEST_P(MatMulTest, PackedRhs) { const auto ref_packed_rhs_size = method.packed_rhs_format.default_size_in_bytes(packed_rhs_h, packed_rhs_w); ASSERT_EQ(packed_rhs_size, ref_packed_rhs_size); - const auto packed_rhs_offset = method.fn_get_packed_rhs_offset(info.k, rect.start_row(), rect.start_col()); + const auto packed_rhs_offset = method.fn_get_packed_rhs_offset(info.k, rect.start_row()); const auto ref_packed_rhs_offset = method.packed_rhs_format.default_offset_in_bytes(rect.start_row(), rect.start_col(), packed_rhs_w); ASSERT_EQ(packed_rhs_offset, ref_packed_rhs_offset); -- GitLab