From 6d5bf0b8ff7ce7ba23da9d0151136d330ca97a48 Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Mon, 30 Sep 2024 01:33:11 +0300 Subject: [PATCH] Add fp16 hybrid gemm kernel with 6x32 output block size Signed-off-by: Gunes Bayir --- CMakeLists.txt | 2 + ..._f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c | 5352 +++++++++++++++++ ..._f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h | 126 + ...hs_pack_kxn_f16p32x1biasf16_f16_f16_neon.c | 296 + ...hs_pack_kxn_f16p32x1biasf16_f16_f16_neon.h | 80 + test/tests/matmul_test.cpp | 52 + 6 files changed, 5908 insertions(+) create mode 100644 kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c create mode 100644 kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h create mode 100644 kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon.c create mode 100644 kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ee4f3304..fa9b5f2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,9 @@ set(KLEIDIAI_FILES_SCALAR set(KLEIDIAI_FILES_NEON_FP16 kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c + kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon.c kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c + kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c ) set(KLEIDIAI_FILES_NEON diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c new file mode 100644 index 00000000..4c04f4a5 --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.c @@ -0,0 +1,5352 @@ +// +// 
SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. +#else // Architectural features check. + +#include "kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h" + +// NOTE(review): the three bare #include directives below carry no header name in this copy of the patch; +// the names appear to have been lost and must be restored before this file can build. +#include +#include +#include + +#include "kai/kai_common.h" + +// Constants reported by the m_step / n_step / nr / kr / sr getters below. +// The offset helpers below require row indices that are multiples of kai_mr and column indices that are multiples of kai_nr. +static const size_t kai_mr = 6; +static const size_t kai_nr = 32; +static const size_t kai_kr = 1; +static const size_t kai_sr = 1; + +// Returns kai_mr (6). +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void) { + return kai_mr; +} + +// Returns kai_nr (32). +size_t kai_get_n_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void) { + return kai_nr; +} + +// Returns kai_nr (32). +size_t kai_get_nr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void) { + return kai_nr; +} + +// Returns kai_kr (1). +size_t kai_get_kr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void) { + return kai_kr; +} + +// Returns kai_sr (1). +size_t kai_get_sr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void) { + return kai_sr; +} + +// Returns m_idx * stride. m_idx must be a multiple of kai_mr (checked with KAI_ASSUME). +// NOTE(review): stride is presumably the LHS row stride in bytes, making the result a byte offset - confirm against the header. +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t m_idx, size_t stride) { + KAI_ASSUME(m_idx % kai_mr == 0); + + return m_idx * stride; +} + +// Returns the byte offset into the packed RHS for column n_idx; n_idx must be a multiple of kai_nr (checked with KAI_ASSUME). +// Every group of kai_nr columns is counted as (kai_nr * sizeof(__fp16)) bytes plus (kai_nr * k * sizeof(__fp16)) bytes. +// NOTE(review): the leading kai_nr values are presumably the per-column bias ("biasf16" in the kernel name) - confirm against the packing routine. +size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t n_idx, size_t k) { + KAI_ASSUME(n_idx % kai_nr == 0); + + return n_idx / kai_nr * (kai_nr * sizeof(__fp16) + kai_nr * k * sizeof(__fp16)); +} + +// Returns m_idx * stride plus n_idx * sizeof(__fp16). +// m_idx must be a multiple of kai_mr and n_idx a multiple of kai_nr (both checked with KAI_ASSUME). +// NOTE(review): stride is presumably the destination row stride in bytes - confirm against the header. +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla( + size_t m_idx, size_t n_idx, size_t stride) { + KAI_ASSUME(m_idx % kai_mr == 0); + KAI_ASSUME(n_idx % kai_nr == 0); + + return m_idx * stride + n_idx * sizeof(__fp16); +} + +// Returns m * n * sizeof(__fp16), i.e. the size in bytes of m * n __fp16 elements. +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t m, size_t n) { + return m * n * sizeof(__fp16); +} + +void 
kai_run_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla( + size_t m, size_t n, size_t k, // + const void* lhs, size_t lhs_stride, // + const void* rhs_packed, // + void* dst, size_t dst_stride_row, size_t dst_stride_col, // + __fp16 clamp_min, __fp16 clamp_max) { + KAI_ASSERT(dst_stride_col == sizeof(__fp16)); + + typedef struct { + __fp16 maxval; + __fp16 minval; + unsigned int num_strings; + const unsigned int* string_lengths; + size_t N; + const void* B_ptr; + size_t output_offset; + size_t input_initial_col; + size_t input_offset; + void* output_ptr; + const void* bias; + } KernelArgs; + + KernelArgs ka; + + unsigned long flags = 0; + + unsigned int string_length = k; + ka.num_strings = 1; + ka.string_lengths = &string_length; + ka.N = n; + ka.B_ptr = rhs_packed; + ka.bias = NULL; + + // Direct input. + const void* input_ptr = lhs; + ka.input_offset = lhs_stride / sizeof(__fp16); + ka.input_initial_col = 0; + + // Direct output. + ka.output_ptr = dst; + ka.output_offset = dst_stride_row / sizeof(__fp16); + + // Clamping output. 
+ flags |= 0x2; + ka.maxval = clamp_max; + ka.minval = clamp_min; + + __asm__ __volatile__( + "1:" // Row loop + "cmp %x[m], #0x6\n" + "bge 246f\n" + "cmp %x[m], #0x4\n" + "bgt 197f\n" + "beq 148f\n" + "cmp %x[m], #0x2\n" + "bgt 99f\n" + "beq 50f\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "2:" // Height 1: Column loop + "cbz x10, 3f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "b 22f\n" + "3:" // Height 1: no bias + "tbz %x[flags], #0, 21f\n" + "cmp x11, #0x20\n" + "bge 20f\n" + "tbz x11, #4, 11f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "tbz x11, #3, 7f\n" + "ld1 { v10.8h }, [x9], #0x10\n" + "tbz x11, #2, 5f\n" + "ldr d11, [x9], #0x8\n" + "tbz x11, #1, 4f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "mov x20, #0x3c\n" + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[6], [x9]\n" + "b 19f\n" + "4:" // Height 1: Partial accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[4], [x9]\n" + "b 19f\n" + "5:" // Height 1: Partial accumulate: partial_2_24 + "tbz x11, #1, 6f\n" + "ldr s11, [x9], #0x4\n" + "mov x20, #0x34\n" + "tbz x11, #0, 19f\n" + "ld1 { v11.h }[2], [x9]\n" + "b 19f\n" + "6:" // Height 1: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 19f\n" + "ldr h11, [x9, #0x0]\n" + "b 19f\n" + "7:" // Height 1: Partial accumulate: partial_4_16 + "tbz x11, #2, 9f\n" + "ldr d10, [x9], #0x8\n" + "tbz x11, #1, 8f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "mov x20, #0x2c\n" + "tbz x11, #0, 19f\n" + "ld1 { v10.h }[6], [x9]\n" + "b 19f\n" + "8:" // Height 1: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 19f\n" + "ld1 { v10.h }[4], [x9]\n" + "b 19f\n" + "9:" // Height 1: Partial accumulate: partial_2_16 + "tbz x11, #1, 10f\n" + "ldr s10, [x9], #0x4\n" + "mov x20, #0x24\n" + "tbz x11, #0, 
19f\n" + "ld1 { v10.h }[2], [x9]\n" + "b 19f\n" + "10:" // Height 1: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 19f\n" + "ldr h10, [x9, #0x0]\n" + "b 19f\n" + "11:" // Height 1: Partial accumulate: partial_8_0 + "tbz x11, #3, 15f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "tbz x11, #2, 13f\n" + "ldr d9, [x9], #0x8\n" + "tbz x11, #1, 12f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "mov x20, #0x1c\n" + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[6], [x9]\n" + "b 19f\n" + "12:" // Height 1: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[4], [x9]\n" + "b 19f\n" + "13:" // Height 1: Partial accumulate: partial_2_8 + "tbz x11, #1, 14f\n" + "ldr s9, [x9], #0x4\n" + "mov x20, #0x14\n" + "tbz x11, #0, 19f\n" + "ld1 { v9.h }[2], [x9]\n" + "b 19f\n" + "14:" // Height 1: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 19f\n" + "ldr h9, [x9, #0x0]\n" + "b 19f\n" + "15:" // Height 1: Partial accumulate: partial_4_0 + "tbz x11, #2, 17f\n" + "ldr d8, [x9], #0x8\n" + "tbz x11, #1, 16f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "mov x20, #0xc\n" + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[6], [x9]\n" + "b 19f\n" + "16:" // Height 1: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[4], [x9]\n" + "b 19f\n" + "17:" // Height 1: Partial accumulate: partial_2_0 + "tbz x11, #1, 18f\n" + "ldr s8, [x9], #0x4\n" + "mov x20, #0x4\n" + "tbz x11, #0, 19f\n" + "ld1 { v8.h }[2], [x9]\n" + "b 19f\n" + "18:" // Height 1: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "mov x20, #0x0\n" + "19:" // Height 1: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 22f\n" + "20:" // Height 1: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "b 22f\n" + "21:" // Height 1: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "22:" // Height 1: setup done + 
"mov x28, #0x0\n" + "23:" // Height 1: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 24f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "cbnz x28, 25f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "b 25f\n" + "24:" // Height 1: setup direct input + "mov x26, %x[input_ptr]\n" + "25:" // Height 1: input setup done + "cmp x27, #0x8\n" + "blt 28f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q7, [x10, #0x10]\n" + "blt 27f\n" + "26:" // Height 1: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" 
+ "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 26b\n" + "27:" // Height 1: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x26, x26, #0x10\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "ldr q7, [x10, 
#0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "28:" // Height 1: Multiply loop: Main loop skip + "cbz x27, 30f\n" + "29:" // Height 1: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr q6, [x10, #0x0]\n" + "sub x27, x27, #0x1\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "cbnz x27, 29b\n" + "30:" // Height 1: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 23b\n" + "prfm pstl1keep, [x9, #0x0]\n" + "tbz %x[flags], #1, 31f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + 
"fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" + "31:" // Height 1: No activation + "cmp x11, #0x20\n" + "bge 48f\n" + "tbz x11, #4, 39f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "tbz x11, #3, 35f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "tbz x11, #2, 33f\n" + "str d11, [x9], #0x8\n" + "tbz x11, #1, 32f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v11.h }[6], [x9]\n" + "b 47f\n" + "32:" // Height 1: Partial direct writeback: partial_1_28 + "tbz x11, #0, 47f\n" + "st1 { v11.h }[4], [x9]\n" + "b 47f\n" + "33:" // Height 1: Partial direct writeback: partial_2_24 + "tbz x11, #1, 34f\n" + "str s11, [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v11.h }[2], [x9]\n" + "b 47f\n" + "34:" // Height 1: Partial direct writeback: partial_1_24 + "tbz x11, #0, 47f\n" + "str h11, [x9, #0x0]\n" + "b 47f\n" + "35:" // Height 1: Partial direct writeback: partial_4_16 + "tbz x11, #2, 37f\n" + "str d10, [x9], #0x8\n" + "tbz x11, #1, 36f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v10.h }[6], [x9]\n" + "b 47f\n" + "36:" // Height 1: Partial direct writeback: partial_1_20 + "tbz x11, #0, 47f\n" + "st1 { v10.h }[4], [x9]\n" + "b 47f\n" + "37:" // Height 1: Partial direct writeback: partial_2_16 + "tbz x11, #1, 38f\n" + "str s10, [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v10.h }[2], [x9]\n" + "b 47f\n" + "38:" // Height 1: Partial direct writeback: partial_1_16 + "tbz x11, #0, 47f\n" + "str h10, [x9, #0x0]\n" + "b 47f\n" + "39:" // Height 1: Partial direct writeback: partial_8_0 + "tbz x11, #3, 43f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "tbz x11, #2, 41f\n" + "str d9, [x9], #0x8\n" + "tbz x11, #1, 40f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v9.h }[6], [x9]\n" + "b 47f\n" + "40:" // Height 1: Partial direct writeback: partial_1_12 + "tbz x11, #0, 47f\n" + "st1 { v9.h }[4], [x9]\n" + "b 47f\n" + "41:" // Height 1: Partial direct writeback: partial_2_8 + "tbz x11, 
#1, 42f\n" + "str s9, [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v9.h }[2], [x9]\n" + "b 47f\n" + "42:" // Height 1: Partial direct writeback: partial_1_8 + "tbz x11, #0, 47f\n" + "str h9, [x9, #0x0]\n" + "b 47f\n" + "43:" // Height 1: Partial direct writeback: partial_4_0 + "tbz x11, #2, 45f\n" + "str d8, [x9], #0x8\n" + "tbz x11, #1, 44f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v8.h }[6], [x9]\n" + "b 47f\n" + "44:" // Height 1: Partial direct writeback: partial_1_4 + "tbz x11, #0, 47f\n" + "st1 { v8.h }[4], [x9]\n" + "b 47f\n" + "45:" // Height 1: Partial direct writeback: partial_2_0 + "tbz x11, #1, 46f\n" + "str s8, [x9], #0x4\n" + "tbz x11, #0, 47f\n" + "st1 { v8.h }[2], [x9]\n" + "b 47f\n" + "46:" // Height 1: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "47:" // Height 1: Partial direct writeback: Done + "b 49f\n" + "48:" // Height 1: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "49:" // Height 1: Writeback done + "subs x11, x11, #0x20\n" + "bgt 2b\n" + "b 296f\n" + "50:" // Height 2 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "51:" // Height 2: Column loop + "cbz x10, 52f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "mov v12.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "b 71f\n" + "52:" // Height 2: no bias + "tbz %x[flags], #0, 70f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x26, x9, x20, LSL #1\n" + "bge 69f\n" + "tbz x11, #4, 60f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "ld1 { v13.8h }, [x26], #0x10\n" + "tbz x11, #3, 56f\n" + "ld1 { v10.8h }, [x9], 
#0x10\n" + "ld1 { v14.8h }, [x26], #0x10\n" + "tbz x11, #2, 54f\n" + "ldr d11, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "tbz x11, #1, 53f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "ld1 { v15.s }[2], [x26], #0x4\n" + "mov x20, #0x3c\n" + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[6], [x9]\n" + "ld1 { v15.h }[6], [x26]\n" + "b 68f\n" + "53:" // Height 2: Partial accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[4], [x9]\n" + "ld1 { v15.h }[4], [x26]\n" + "b 68f\n" + "54:" // Height 2: Partial accumulate: partial_2_24 + "tbz x11, #1, 55f\n" + "ldr s11, [x9], #0x4\n" + "ldr s15, [x26], #0x4\n" + "mov x20, #0x34\n" + "tbz x11, #0, 68f\n" + "ld1 { v11.h }[2], [x9]\n" + "ld1 { v15.h }[2], [x26]\n" + "b 68f\n" + "55:" // Height 2: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 68f\n" + "ldr h11, [x9, #0x0]\n" + "ldr h15, [x26, #0x0]\n" + "b 68f\n" + "56:" // Height 2: Partial accumulate: partial_4_16 + "tbz x11, #2, 58f\n" + "ldr d10, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "tbz x11, #1, 57f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v14.s }[2], [x26], #0x4\n" + "mov x20, #0x2c\n" + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[6], [x9]\n" + "ld1 { v14.h }[6], [x26]\n" + "b 68f\n" + "57:" // Height 2: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[4], [x9]\n" + "ld1 { v14.h }[4], [x26]\n" + "b 68f\n" + "58:" // Height 2: Partial accumulate: partial_2_16 + "tbz x11, #1, 59f\n" + "ldr s10, [x9], #0x4\n" + "ldr s14, [x26], #0x4\n" + "mov x20, #0x24\n" + "tbz x11, #0, 68f\n" + "ld1 { v10.h }[2], [x9]\n" + "ld1 { v14.h }[2], [x26]\n" + "b 68f\n" + "59:" // Height 2: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 68f\n" + "ldr h10, [x9, #0x0]\n" + "ldr h14, [x26, #0x0]\n" + "b 68f\n" + "60:" // Height 2: Partial accumulate: partial_8_0 + "tbz x11, #3, 64f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "tbz x11, #2, 62f\n" + "ldr d9, 
[x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "tbz x11, #1, 61f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[6], [x9]\n" + "ld1 { v13.h }[6], [x26]\n" + "b 68f\n" + "61:" // Height 2: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[4], [x9]\n" + "ld1 { v13.h }[4], [x26]\n" + "b 68f\n" + "62:" // Height 2: Partial accumulate: partial_2_8 + "tbz x11, #1, 63f\n" + "ldr s9, [x9], #0x4\n" + "ldr s13, [x26], #0x4\n" + "mov x20, #0x14\n" + "tbz x11, #0, 68f\n" + "ld1 { v9.h }[2], [x9]\n" + "ld1 { v13.h }[2], [x26]\n" + "b 68f\n" + "63:" // Height 2: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 68f\n" + "ldr h9, [x9, #0x0]\n" + "ldr h13, [x26, #0x0]\n" + "b 68f\n" + "64:" // Height 2: Partial accumulate: partial_4_0 + "tbz x11, #2, 66f\n" + "ldr d8, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "tbz x11, #1, 65f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x26]\n" + "b 68f\n" + "65:" // Height 2: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x26]\n" + "b 68f\n" + "66:" // Height 2: Partial accumulate: partial_2_0 + "tbz x11, #1, 67f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x26], #0x4\n" + "mov x20, #0x4\n" + "tbz x11, #0, 68f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x26]\n" + "b 68f\n" + "67:" // Height 2: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "68:" // Height 2: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 71f\n" + "69:" // Height 2: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr 
q15, [x26, #0x30]\n" + "b 71f\n" + "70:" // Height 2: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "71:" // Height 2: setup done + "mov x28, #0x0\n" + "72:" // Height 2: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 73f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "cbnz x28, 74f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "b 74f\n" + "73:" // Height 2: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "74:" // Height 2: input setup done + "cmp x27, #0x8\n" + "blt 77f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 76f\n" + "75:" // Height 2: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "sub x27, x27, #0x8\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, 
[x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, 
[x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 75b\n" + "76:" // Height 2: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "add x26, x26, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x25, x25, #0x10\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "ldr q7, 
[x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "77:" // Height 2: Multiply loop: Main loop skip + "cbz x27, 79f\n" + "78:" // Height 2: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla 
v12.8h, v6.8h, v1.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "cbnz x27, 78b\n" + "79:" // Height 2: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 72b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "tbz %x[flags], #1, 80f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v17.8h }, [x21]\n" + "ld1r { v16.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v17.8h\n" + "fmin v9.8h, v9.8h, v17.8h\n" + "fmin v10.8h, v10.8h, v17.8h\n" + "fmin v11.8h, v11.8h, v17.8h\n" + "fmin v12.8h, v12.8h, v17.8h\n" + "fmin v13.8h, v13.8h, v17.8h\n" + "fmin v14.8h, v14.8h, v17.8h\n" + "fmin v15.8h, v15.8h, v17.8h\n" + "fmax v8.8h, v8.8h, v16.8h\n" + "fmax v9.8h, v9.8h, v16.8h\n" + "fmax v10.8h, v10.8h, v16.8h\n" + "fmax v11.8h, v11.8h, v16.8h\n" + "fmax v12.8h, v12.8h, v16.8h\n" + "fmax v13.8h, v13.8h, v16.8h\n" + "fmax v14.8h, v14.8h, v16.8h\n" + "fmax v15.8h, v15.8h, v16.8h\n" + "80:" // Height 2: No activation + "cmp x11, #0x20\n" + "bge 97f\n" + "tbz x11, #4, 88f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v13.8h }, [x26], #0x10\n" + "tbz x11, #3, 84f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "st1 { v14.8h }, [x26], #0x10\n" + "tbz x11, #2, 82f\n" + "str d11, [x9], #0x8\n" + "str d15, [x26], #0x8\n" + "tbz x11, #1, 81f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v11.h }[6], [x9]\n" + "st1 { v15.h }[6], [x26]\n" + "b 96f\n" + "81:" // Height 2: Partial direct writeback: 
partial_1_28 + "tbz x11, #0, 96f\n" + "st1 { v11.h }[4], [x9]\n" + "st1 { v15.h }[4], [x26]\n" + "b 96f\n" + "82:" // Height 2: Partial direct writeback: partial_2_24 + "tbz x11, #1, 83f\n" + "str s11, [x9], #0x4\n" + "str s15, [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v11.h }[2], [x9]\n" + "st1 { v15.h }[2], [x26]\n" + "b 96f\n" + "83:" // Height 2: Partial direct writeback: partial_1_24 + "tbz x11, #0, 96f\n" + "str h11, [x9, #0x0]\n" + "str h15, [x26, #0x0]\n" + "b 96f\n" + "84:" // Height 2: Partial direct writeback: partial_4_16 + "tbz x11, #2, 86f\n" + "str d10, [x9], #0x8\n" + "str d14, [x26], #0x8\n" + "tbz x11, #1, 85f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "st1 { v14.s }[2], [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v10.h }[6], [x9]\n" + "st1 { v14.h }[6], [x26]\n" + "b 96f\n" + "85:" // Height 2: Partial direct writeback: partial_1_20 + "tbz x11, #0, 96f\n" + "st1 { v10.h }[4], [x9]\n" + "st1 { v14.h }[4], [x26]\n" + "b 96f\n" + "86:" // Height 2: Partial direct writeback: partial_2_16 + "tbz x11, #1, 87f\n" + "str s10, [x9], #0x4\n" + "str s14, [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v10.h }[2], [x9]\n" + "st1 { v14.h }[2], [x26]\n" + "b 96f\n" + "87:" // Height 2: Partial direct writeback: partial_1_16 + "tbz x11, #0, 96f\n" + "str h10, [x9, #0x0]\n" + "str h14, [x26, #0x0]\n" + "b 96f\n" + "88:" // Height 2: Partial direct writeback: partial_8_0 + "tbz x11, #3, 92f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "tbz x11, #2, 90f\n" + "str d9, [x9], #0x8\n" + "str d13, [x26], #0x8\n" + "tbz x11, #1, 89f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "st1 { v13.s }[2], [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v9.h }[6], [x9]\n" + "st1 { v13.h }[6], [x26]\n" + "b 96f\n" + "89:" // Height 2: Partial direct writeback: partial_1_12 + "tbz x11, #0, 96f\n" + "st1 { v9.h }[4], [x9]\n" + "st1 { v13.h }[4], [x26]\n" + "b 96f\n" + "90:" // Height 2: Partial direct writeback: partial_2_8 + "tbz x11, #1, 91f\n" + "str s9, 
[x9], #0x4\n" + "str s13, [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v9.h }[2], [x9]\n" + "st1 { v13.h }[2], [x26]\n" + "b 96f\n" + "91:" // Height 2: Partial direct writeback: partial_1_8 + "tbz x11, #0, 96f\n" + "str h9, [x9, #0x0]\n" + "str h13, [x26, #0x0]\n" + "b 96f\n" + "92:" // Height 2: Partial direct writeback: partial_4_0 + "tbz x11, #2, 94f\n" + "str d8, [x9], #0x8\n" + "str d12, [x26], #0x8\n" + "tbz x11, #1, 93f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v8.h }[6], [x9]\n" + "st1 { v12.h }[6], [x26]\n" + "b 96f\n" + "93:" // Height 2: Partial direct writeback: partial_1_4 + "tbz x11, #0, 96f\n" + "st1 { v8.h }[4], [x9]\n" + "st1 { v12.h }[4], [x26]\n" + "b 96f\n" + "94:" // Height 2: Partial direct writeback: partial_2_0 + "tbz x11, #1, 95f\n" + "str s8, [x9], #0x4\n" + "str s12, [x26], #0x4\n" + "tbz x11, #0, 96f\n" + "st1 { v8.h }[2], [x9]\n" + "st1 { v12.h }[2], [x26]\n" + "b 96f\n" + "95:" // Height 2: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "str h12, [x26, #0x0]\n" + "96:" // Height 2: Partial direct writeback: Done + "b 98f\n" + "97:" // Height 2: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q12, [x26, #0x0]\n" + "str q13, [x26, #0x10]\n" + "str q14, [x26, #0x20]\n" + "str q15, [x26, #0x30]\n" + "98:" // Height 2: Writeback done + "subs x11, x11, #0x20\n" + "bgt 51b\n" + "b 296f\n" + "99:" // Height 3 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "100:" // Height 3: Column loop + "cbz x10, 101f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "mov v12.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v14.16b, 
v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "b 120f\n" + "101:" // Height 3: no bias + "tbz %x[flags], #0, 119f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "bge 118f\n" + "tbz x11, #4, 109f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "ld1 { v13.8h }, [x26], #0x10\n" + "ld1 { v17.8h }, [x25], #0x10\n" + "tbz x11, #3, 105f\n" + "ld1 { v10.8h }, [x9], #0x10\n" + "ld1 { v14.8h }, [x26], #0x10\n" + "ld1 { v18.8h }, [x25], #0x10\n" + "tbz x11, #2, 103f\n" + "ldr d11, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "ldr d19, [x25], #0x8\n" + "tbz x11, #1, 102f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "ld1 { v15.s }[2], [x26], #0x4\n" + "mov x20, #0x3c\n" + "ld1 { v19.s }[2], [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[6], [x9]\n" + "ld1 { v15.h }[6], [x26]\n" + "ld1 { v19.h }[6], [x25]\n" + "b 117f\n" + "102:" // Height 3: Partial accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[4], [x9]\n" + "ld1 { v15.h }[4], [x26]\n" + "ld1 { v19.h }[4], [x25]\n" + "b 117f\n" + "103:" // Height 3: Partial accumulate: partial_2_24 + "tbz x11, #1, 104f\n" + "ldr s11, [x9], #0x4\n" + "ldr s15, [x26], #0x4\n" + "mov x20, #0x34\n" + "ldr s19, [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v11.h }[2], [x9]\n" + "ld1 { v15.h }[2], [x26]\n" + "ld1 { v19.h }[2], [x25]\n" + "b 117f\n" + "104:" // Height 3: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 117f\n" + "ldr h11, [x9, #0x0]\n" + "ldr h15, [x26, #0x0]\n" + "ldr h19, [x25, #0x0]\n" + "b 117f\n" + "105:" // Height 3: Partial accumulate: partial_4_16 + "tbz x11, #2, 107f\n" + "ldr d10, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "tbz x11, #1, 106f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v14.s 
}[2], [x26], #0x4\n" + "mov x20, #0x2c\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[6], [x9]\n" + "ld1 { v14.h }[6], [x26]\n" + "ld1 { v18.h }[6], [x25]\n" + "b 117f\n" + "106:" // Height 3: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[4], [x9]\n" + "ld1 { v14.h }[4], [x26]\n" + "ld1 { v18.h }[4], [x25]\n" + "b 117f\n" + "107:" // Height 3: Partial accumulate: partial_2_16 + "tbz x11, #1, 108f\n" + "ldr s10, [x9], #0x4\n" + "ldr s14, [x26], #0x4\n" + "mov x20, #0x24\n" + "ldr s18, [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v10.h }[2], [x9]\n" + "ld1 { v14.h }[2], [x26]\n" + "ld1 { v18.h }[2], [x25]\n" + "b 117f\n" + "108:" // Height 3: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 117f\n" + "ldr h10, [x9, #0x0]\n" + "ldr h14, [x26, #0x0]\n" + "ldr h18, [x25, #0x0]\n" + "b 117f\n" + "109:" // Height 3: Partial accumulate: partial_8_0 + "tbz x11, #3, 113f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "tbz x11, #2, 111f\n" + "ldr d9, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "tbz x11, #1, 110f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[6], [x9]\n" + "ld1 { v13.h }[6], [x26]\n" + "ld1 { v17.h }[6], [x25]\n" + "b 117f\n" + "110:" // Height 3: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[4], [x9]\n" + "ld1 { v13.h }[4], [x26]\n" + "ld1 { v17.h }[4], [x25]\n" + "b 117f\n" + "111:" // Height 3: Partial accumulate: partial_2_8 + "tbz x11, #1, 112f\n" + "ldr s9, [x9], #0x4\n" + "ldr s13, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s17, [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v9.h }[2], [x9]\n" + "ld1 { v13.h }[2], [x26]\n" + "ld1 { v17.h }[2], [x25]\n" + "b 117f\n" + "112:" // Height 3: Partial 
accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 117f\n" + "ldr h9, [x9, #0x0]\n" + "ldr h13, [x26, #0x0]\n" + "ldr h17, [x25, #0x0]\n" + "b 117f\n" + "113:" // Height 3: Partial accumulate: partial_4_0 + "tbz x11, #2, 115f\n" + "ldr d8, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "tbz x11, #1, 114f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v16.s }[2], [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x26]\n" + "ld1 { v16.h }[6], [x25]\n" + "b 117f\n" + "114:" // Height 3: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x26]\n" + "ld1 { v16.h }[4], [x25]\n" + "b 117f\n" + "115:" // Height 3: Partial accumulate: partial_2_0 + "tbz x11, #1, 116f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s16, [x25], #0x4\n" + "tbz x11, #0, 117f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x26]\n" + "ld1 { v16.h }[2], [x25]\n" + "b 117f\n" + "116:" // Height 3: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x25, #0x0]\n" + "117:" // Height 3: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 120f\n" + "118:" // Height 3: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q16, [x25, #0x0]\n" + "ldr q17, [x25, #0x10]\n" + "ldr q18, [x25, #0x20]\n" + "ldr q19, [x25, #0x30]\n" + "b 120f\n" + "119:" // Height 3: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi 
v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "120:" // Height 3: setup done + "mov x28, #0x0\n" + "121:" // Height 3: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 122f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "cbnz x28, 123f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "b 123f\n" + "122:" // Height 3: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "123:" // Height 3: input setup done + "cmp x27, #0x8\n" + "blt 126f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 125f\n" + "124:" // Height 3: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x25, x25, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x24, x24, #0x10\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla 
v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + 
"ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 124b\n" + "125:" // Height 3: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x24, x24, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "sub x27, x27, #0x8\n" + "prfm 
pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" 
+ "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "126:" // Height 3: Multiply loop: Main loop skip + 
"cbz x27, 128f\n" + "127:" // Height 3: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "cbnz x27, 127b\n" + "128:" // Height 3: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 121b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "tbz %x[flags], #1, 129f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v21.8h }, [x21]\n" + "ld1r { v20.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v21.8h\n" + "fmin v9.8h, v9.8h, v21.8h\n" + "fmin v10.8h, v10.8h, v21.8h\n" + "fmin v11.8h, v11.8h, v21.8h\n" + "fmin v12.8h, v12.8h, v21.8h\n" + "fmin v13.8h, v13.8h, v21.8h\n" + "fmin v14.8h, v14.8h, v21.8h\n" + "fmin v15.8h, v15.8h, v21.8h\n" + "fmin v16.8h, v16.8h, v21.8h\n" + "fmin v17.8h, v17.8h, v21.8h\n" + "fmin v18.8h, v18.8h, v21.8h\n" + "fmin v19.8h, v19.8h, v21.8h\n" + "fmax v8.8h, v8.8h, v20.8h\n" + "fmax v9.8h, v9.8h, v20.8h\n" + "fmax v10.8h, v10.8h, v20.8h\n" + "fmax v11.8h, v11.8h, v20.8h\n" + "fmax v12.8h, v12.8h, v20.8h\n" + "fmax v13.8h, v13.8h, v20.8h\n" + "fmax v14.8h, v14.8h, v20.8h\n" + "fmax v15.8h, v15.8h, v20.8h\n" + "fmax v16.8h, v16.8h, 
v20.8h\n" + "fmax v17.8h, v17.8h, v20.8h\n" + "fmax v18.8h, v18.8h, v20.8h\n" + "fmax v19.8h, v19.8h, v20.8h\n" + "129:" // Height 3: No activation + "cmp x11, #0x20\n" + "bge 146f\n" + "tbz x11, #4, 137f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v13.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v17.8h }, [x25], #0x10\n" + "tbz x11, #3, 133f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "st1 { v14.8h }, [x26], #0x10\n" + "st1 { v18.8h }, [x25], #0x10\n" + "tbz x11, #2, 131f\n" + "str d11, [x9], #0x8\n" + "str d15, [x26], #0x8\n" + "str d19, [x25], #0x8\n" + "tbz x11, #1, 130f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v11.h }[6], [x9]\n" + "st1 { v15.h }[6], [x26]\n" + "st1 { v19.h }[6], [x25]\n" + "b 145f\n" + "130:" // Height 3: Partial direct writeback: partial_1_28 + "tbz x11, #0, 145f\n" + "st1 { v11.h }[4], [x9]\n" + "st1 { v15.h }[4], [x26]\n" + "st1 { v19.h }[4], [x25]\n" + "b 145f\n" + "131:" // Height 3: Partial direct writeback: partial_2_24 + "tbz x11, #1, 132f\n" + "str s11, [x9], #0x4\n" + "str s15, [x26], #0x4\n" + "str s19, [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v11.h }[2], [x9]\n" + "st1 { v15.h }[2], [x26]\n" + "st1 { v19.h }[2], [x25]\n" + "b 145f\n" + "132:" // Height 3: Partial direct writeback: partial_1_24 + "tbz x11, #0, 145f\n" + "str h11, [x9, #0x0]\n" + "str h15, [x26, #0x0]\n" + "str h19, [x25, #0x0]\n" + "b 145f\n" + "133:" // Height 3: Partial direct writeback: partial_4_16 + "tbz x11, #2, 135f\n" + "str d10, [x9], #0x8\n" + "str d14, [x26], #0x8\n" + "str d18, [x25], #0x8\n" + "tbz x11, #1, 134f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "st1 { v14.s }[2], [x26], #0x4\n" + "st1 { v18.s }[2], [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v10.h }[6], [x9]\n" + "st1 { v14.h }[6], [x26]\n" + "st1 { v18.h }[6], [x25]\n" + "b 145f\n" + "134:" // Height 
3: Partial direct writeback: partial_1_20 + "tbz x11, #0, 145f\n" + "st1 { v10.h }[4], [x9]\n" + "st1 { v14.h }[4], [x26]\n" + "st1 { v18.h }[4], [x25]\n" + "b 145f\n" + "135:" // Height 3: Partial direct writeback: partial_2_16 + "tbz x11, #1, 136f\n" + "str s10, [x9], #0x4\n" + "str s14, [x26], #0x4\n" + "str s18, [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v10.h }[2], [x9]\n" + "st1 { v14.h }[2], [x26]\n" + "st1 { v18.h }[2], [x25]\n" + "b 145f\n" + "136:" // Height 3: Partial direct writeback: partial_1_16 + "tbz x11, #0, 145f\n" + "str h10, [x9, #0x0]\n" + "str h14, [x26, #0x0]\n" + "str h18, [x25, #0x0]\n" + "b 145f\n" + "137:" // Height 3: Partial direct writeback: partial_8_0 + "tbz x11, #3, 141f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "tbz x11, #2, 139f\n" + "str d9, [x9], #0x8\n" + "str d13, [x26], #0x8\n" + "str d17, [x25], #0x8\n" + "tbz x11, #1, 138f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "st1 { v13.s }[2], [x26], #0x4\n" + "st1 { v17.s }[2], [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v9.h }[6], [x9]\n" + "st1 { v13.h }[6], [x26]\n" + "st1 { v17.h }[6], [x25]\n" + "b 145f\n" + "138:" // Height 3: Partial direct writeback: partial_1_12 + "tbz x11, #0, 145f\n" + "st1 { v9.h }[4], [x9]\n" + "st1 { v13.h }[4], [x26]\n" + "st1 { v17.h }[4], [x25]\n" + "b 145f\n" + "139:" // Height 3: Partial direct writeback: partial_2_8 + "tbz x11, #1, 140f\n" + "str s9, [x9], #0x4\n" + "str s13, [x26], #0x4\n" + "str s17, [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v9.h }[2], [x9]\n" + "st1 { v13.h }[2], [x26]\n" + "st1 { v17.h }[2], [x25]\n" + "b 145f\n" + "140:" // Height 3: Partial direct writeback: partial_1_8 + "tbz x11, #0, 145f\n" + "str h9, [x9, #0x0]\n" + "str h13, [x26, #0x0]\n" + "str h17, [x25, #0x0]\n" + "b 145f\n" + "141:" // Height 3: Partial direct writeback: partial_4_0 + "tbz x11, #2, 143f\n" + "str d8, [x9], #0x8\n" + "str d12, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + 
"tbz x11, #1, 142f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v8.h }[6], [x9]\n" + "st1 { v12.h }[6], [x26]\n" + "st1 { v16.h }[6], [x25]\n" + "b 145f\n" + "142:" // Height 3: Partial direct writeback: partial_1_4 + "tbz x11, #0, 145f\n" + "st1 { v8.h }[4], [x9]\n" + "st1 { v12.h }[4], [x26]\n" + "st1 { v16.h }[4], [x25]\n" + "b 145f\n" + "143:" // Height 3: Partial direct writeback: partial_2_0 + "tbz x11, #1, 144f\n" + "str s8, [x9], #0x4\n" + "str s12, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "tbz x11, #0, 145f\n" + "st1 { v8.h }[2], [x9]\n" + "st1 { v12.h }[2], [x26]\n" + "st1 { v16.h }[2], [x25]\n" + "b 145f\n" + "144:" // Height 3: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "str h12, [x26, #0x0]\n" + "str h16, [x25, #0x0]\n" + "145:" // Height 3: Partial direct writeback: Done + "b 147f\n" + "146:" // Height 3: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q12, [x26, #0x0]\n" + "str q13, [x26, #0x10]\n" + "str q14, [x26, #0x20]\n" + "str q15, [x26, #0x30]\n" + "str q16, [x25, #0x0]\n" + "str q17, [x25, #0x10]\n" + "str q18, [x25, #0x20]\n" + "str q19, [x25, #0x30]\n" + "147:" // Height 3: Writeback done + "subs x11, x11, #0x20\n" + "bgt 100b\n" + "b 296f\n" + "148:" // Height 4 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "149:" // Height 4: Column loop + "cbz x10, 150f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "mov v12.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v20.16b, v8.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, 
v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "b 169f\n" + "150:" // Height 4: no bias + "tbz %x[flags], #0, 168f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "bge 167f\n" + "tbz x11, #4, 158f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "ld1 { v13.8h }, [x26], #0x10\n" + "ld1 { v17.8h }, [x25], #0x10\n" + "ld1 { v21.8h }, [x24], #0x10\n" + "tbz x11, #3, 154f\n" + "ld1 { v10.8h }, [x9], #0x10\n" + "ld1 { v14.8h }, [x26], #0x10\n" + "ld1 { v18.8h }, [x25], #0x10\n" + "ld1 { v22.8h }, [x24], #0x10\n" + "tbz x11, #2, 152f\n" + "ldr d11, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "ldr d19, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "tbz x11, #1, 151f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "ld1 { v15.s }[2], [x26], #0x4\n" + "mov x20, #0x3c\n" + "ld1 { v19.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[6], [x9]\n" + "ld1 { v15.h }[6], [x26]\n" + "ld1 { v19.h }[6], [x25]\n" + "ld1 { v23.h }[6], [x24]\n" + "b 166f\n" + "151:" // Height 4: Partial accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[4], [x9]\n" + "ld1 { v15.h }[4], [x26]\n" + "ld1 { v19.h }[4], [x25]\n" + "ld1 { v23.h }[4], [x24]\n" + "b 166f\n" + "152:" // Height 4: Partial accumulate: partial_2_24 + "tbz x11, #1, 153f\n" + "ldr s11, [x9], #0x4\n" + "ldr s15, [x26], #0x4\n" + "mov x20, #0x34\n" + "ldr s19, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v11.h }[2], [x9]\n" + "ld1 { v15.h }[2], [x26]\n" + "ld1 { v19.h }[2], [x25]\n" + "ld1 { v23.h }[2], [x24]\n" + "b 166f\n" + "153:" // Height 4: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 166f\n" + "ldr h11, 
[x9, #0x0]\n" + "ldr h15, [x26, #0x0]\n" + "ldr h19, [x25, #0x0]\n" + "ldr h23, [x24, #0x0]\n" + "b 166f\n" + "154:" // Height 4: Partial accumulate: partial_4_16 + "tbz x11, #2, 156f\n" + "ldr d10, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "tbz x11, #1, 155f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v14.s }[2], [x26], #0x4\n" + "mov x20, #0x2c\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[6], [x9]\n" + "ld1 { v14.h }[6], [x26]\n" + "ld1 { v18.h }[6], [x25]\n" + "ld1 { v22.h }[6], [x24]\n" + "b 166f\n" + "155:" // Height 4: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[4], [x9]\n" + "ld1 { v14.h }[4], [x26]\n" + "ld1 { v18.h }[4], [x25]\n" + "ld1 { v22.h }[4], [x24]\n" + "b 166f\n" + "156:" // Height 4: Partial accumulate: partial_2_16 + "tbz x11, #1, 157f\n" + "ldr s10, [x9], #0x4\n" + "ldr s14, [x26], #0x4\n" + "mov x20, #0x24\n" + "ldr s18, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v10.h }[2], [x9]\n" + "ld1 { v14.h }[2], [x26]\n" + "ld1 { v18.h }[2], [x25]\n" + "ld1 { v22.h }[2], [x24]\n" + "b 166f\n" + "157:" // Height 4: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 166f\n" + "ldr h10, [x9, #0x0]\n" + "ldr h14, [x26, #0x0]\n" + "ldr h18, [x25, #0x0]\n" + "ldr h22, [x24, #0x0]\n" + "b 166f\n" + "158:" // Height 4: Partial accumulate: partial_8_0 + "tbz x11, #3, 162f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "tbz x11, #2, 160f\n" + "ldr d9, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "tbz x11, #1, 159f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "tbz x11, 
#0, 166f\n" + "ld1 { v9.h }[6], [x9]\n" + "ld1 { v13.h }[6], [x26]\n" + "ld1 { v17.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "b 166f\n" + "159:" // Height 4: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 166f\n" + "ld1 { v9.h }[4], [x9]\n" + "ld1 { v13.h }[4], [x26]\n" + "ld1 { v17.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "b 166f\n" + "160:" // Height 4: Partial accumulate: partial_2_8 + "tbz x11, #1, 161f\n" + "ldr s9, [x9], #0x4\n" + "ldr s13, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s17, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v9.h }[2], [x9]\n" + "ld1 { v13.h }[2], [x26]\n" + "ld1 { v17.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "b 166f\n" + "161:" // Height 4: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 166f\n" + "ldr h9, [x9, #0x0]\n" + "ldr h13, [x26, #0x0]\n" + "ldr h17, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "b 166f\n" + "162:" // Height 4: Partial accumulate: partial_4_0 + "tbz x11, #2, 164f\n" + "ldr d8, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "tbz x11, #1, 163f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v16.s }[2], [x25], #0x4\n" + "ld1 { v20.s }[2], [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x26]\n" + "ld1 { v16.h }[6], [x25]\n" + "ld1 { v20.h }[6], [x24]\n" + "b 166f\n" + "163:" // Height 4: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x26]\n" + "ld1 { v16.h }[4], [x25]\n" + "ld1 { v20.h }[4], [x24]\n" + "b 166f\n" + "164:" // Height 4: Partial accumulate: partial_2_0 + "tbz x11, #1, 165f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s16, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "tbz x11, #0, 166f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x26]\n" + "ld1 
{ v16.h }[2], [x25]\n" + "ld1 { v20.h }[2], [x24]\n" + "b 166f\n" + "165:" // Height 4: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x25, #0x0]\n" + "ldr h20, [x24, #0x0]\n" + "166:" // Height 4: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 169f\n" + "167:" // Height 4: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q16, [x25, #0x0]\n" + "ldr q17, [x25, #0x10]\n" + "ldr q18, [x25, #0x20]\n" + "ldr q19, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, [x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "b 169f\n" + "168:" // Height 4: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "169:" // Height 4: setup done + "mov x28, #0x0\n" + "170:" // Height 4: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 171f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "cbnz x28, 172f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "b 172f\n" + "171:" // Height 4: setup direct input + "mov x26, %x[input_ptr]\n" 
+ "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "172:" // Height 4: input setup done + "cmp x27, #0x8\n" + "blt 175f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 174f\n" + "173:" // Height 4: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "add x25, x25, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "cmp x27, #0x10\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x10, 
#0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" 
+ "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v23.8h, v7.8h, 
v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 173b\n" + "174:" // Height 4: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "add x24, x24, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x23, x23, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, 
v3.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x10, #0x170]\n" + 
"fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "175:" // Height 4: Multiply loop: Main loop skip + "cbz x27, 177f\n" + "176:" // Height 4: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, 
v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "cbnz x27, 176b\n" + "177:" // Height 4: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 170b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "tbz %x[flags], #1, 178f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v25.8h }, [x21]\n" + "ld1r { v24.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v25.8h\n" + "fmin v9.8h, v9.8h, v25.8h\n" + "fmin v10.8h, v10.8h, v25.8h\n" + "fmin v11.8h, v11.8h, v25.8h\n" + "fmin v12.8h, v12.8h, v25.8h\n" + "fmin v13.8h, v13.8h, v25.8h\n" + "fmin v14.8h, v14.8h, v25.8h\n" + "fmin v15.8h, v15.8h, v25.8h\n" + "fmin v16.8h, v16.8h, v25.8h\n" + "fmin v17.8h, v17.8h, v25.8h\n" + "fmin v18.8h, v18.8h, v25.8h\n" + "fmin v19.8h, v19.8h, v25.8h\n" + "fmin v20.8h, v20.8h, v25.8h\n" + "fmin v21.8h, v21.8h, v25.8h\n" + "fmin v22.8h, v22.8h, v25.8h\n" + "fmin v23.8h, v23.8h, v25.8h\n" + "fmax v8.8h, v8.8h, v24.8h\n" + "fmax v9.8h, v9.8h, v24.8h\n" + "fmax v10.8h, v10.8h, v24.8h\n" + "fmax v11.8h, v11.8h, v24.8h\n" + "fmax v12.8h, v12.8h, v24.8h\n" + "fmax v13.8h, v13.8h, v24.8h\n" + "fmax v14.8h, v14.8h, v24.8h\n" + "fmax v15.8h, v15.8h, v24.8h\n" + 
"fmax v16.8h, v16.8h, v24.8h\n" + "fmax v17.8h, v17.8h, v24.8h\n" + "fmax v18.8h, v18.8h, v24.8h\n" + "fmax v19.8h, v19.8h, v24.8h\n" + "fmax v20.8h, v20.8h, v24.8h\n" + "fmax v21.8h, v21.8h, v24.8h\n" + "fmax v22.8h, v22.8h, v24.8h\n" + "fmax v23.8h, v23.8h, v24.8h\n" + "178:" // Height 4: No activation + "cmp x11, #0x20\n" + "bge 195f\n" + "tbz x11, #4, 186f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v13.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v17.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "st1 { v21.8h }, [x24], #0x10\n" + "tbz x11, #3, 182f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "st1 { v14.8h }, [x26], #0x10\n" + "st1 { v18.8h }, [x25], #0x10\n" + "st1 { v22.8h }, [x24], #0x10\n" + "tbz x11, #2, 180f\n" + "str d11, [x9], #0x8\n" + "str d15, [x26], #0x8\n" + "str d19, [x25], #0x8\n" + "str d23, [x24], #0x8\n" + "tbz x11, #1, 179f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x25], #0x4\n" + "st1 { v23.s }[2], [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v11.h }[6], [x9]\n" + "st1 { v15.h }[6], [x26]\n" + "st1 { v19.h }[6], [x25]\n" + "st1 { v23.h }[6], [x24]\n" + "b 194f\n" + "179:" // Height 4: Partial direct writeback: partial_1_28 + "tbz x11, #0, 194f\n" + "st1 { v11.h }[4], [x9]\n" + "st1 { v15.h }[4], [x26]\n" + "st1 { v19.h }[4], [x25]\n" + "st1 { v23.h }[4], [x24]\n" + "b 194f\n" + "180:" // Height 4: Partial direct writeback: partial_2_24 + "tbz x11, #1, 181f\n" + "str s11, [x9], #0x4\n" + "str s15, [x26], #0x4\n" + "str s19, [x25], #0x4\n" + "str s23, [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v11.h }[2], [x9]\n" + "st1 { v15.h }[2], [x26]\n" + "st1 { v19.h }[2], [x25]\n" + "st1 { v23.h }[2], [x24]\n" + "b 194f\n" + "181:" // Height 4: Partial direct writeback: partial_1_24 + "tbz x11, #0, 194f\n" + "str h11, [x9, #0x0]\n" + "str h15, [x26, #0x0]\n" + "str h19, [x25, #0x0]\n" + "str 
h23, [x24, #0x0]\n" + "b 194f\n" + "182:" // Height 4: Partial direct writeback: partial_4_16 + "tbz x11, #2, 184f\n" + "str d10, [x9], #0x8\n" + "str d14, [x26], #0x8\n" + "str d18, [x25], #0x8\n" + "str d22, [x24], #0x8\n" + "tbz x11, #1, 183f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "st1 { v14.s }[2], [x26], #0x4\n" + "st1 { v18.s }[2], [x25], #0x4\n" + "st1 { v22.s }[2], [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v10.h }[6], [x9]\n" + "st1 { v14.h }[6], [x26]\n" + "st1 { v18.h }[6], [x25]\n" + "st1 { v22.h }[6], [x24]\n" + "b 194f\n" + "183:" // Height 4: Partial direct writeback: partial_1_20 + "tbz x11, #0, 194f\n" + "st1 { v10.h }[4], [x9]\n" + "st1 { v14.h }[4], [x26]\n" + "st1 { v18.h }[4], [x25]\n" + "st1 { v22.h }[4], [x24]\n" + "b 194f\n" + "184:" // Height 4: Partial direct writeback: partial_2_16 + "tbz x11, #1, 185f\n" + "str s10, [x9], #0x4\n" + "str s14, [x26], #0x4\n" + "str s18, [x25], #0x4\n" + "str s22, [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v10.h }[2], [x9]\n" + "st1 { v14.h }[2], [x26]\n" + "st1 { v18.h }[2], [x25]\n" + "st1 { v22.h }[2], [x24]\n" + "b 194f\n" + "185:" // Height 4: Partial direct writeback: partial_1_16 + "tbz x11, #0, 194f\n" + "str h10, [x9, #0x0]\n" + "str h14, [x26, #0x0]\n" + "str h18, [x25, #0x0]\n" + "str h22, [x24, #0x0]\n" + "b 194f\n" + "186:" // Height 4: Partial direct writeback: partial_8_0 + "tbz x11, #3, 190f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "tbz x11, #2, 188f\n" + "str d9, [x9], #0x8\n" + "str d13, [x26], #0x8\n" + "str d17, [x25], #0x8\n" + "str d21, [x24], #0x8\n" + "tbz x11, #1, 187f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "st1 { v13.s }[2], [x26], #0x4\n" + "st1 { v17.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v9.h }[6], [x9]\n" + "st1 { v13.h }[6], [x26]\n" + "st1 { v17.h }[6], [x25]\n" + "st1 { v21.h }[6], [x24]\n" + "b 194f\n" + "187:" 
// Height 4: Partial direct writeback: partial_1_12 + "tbz x11, #0, 194f\n" + "st1 { v9.h }[4], [x9]\n" + "st1 { v13.h }[4], [x26]\n" + "st1 { v17.h }[4], [x25]\n" + "st1 { v21.h }[4], [x24]\n" + "b 194f\n" + "188:" // Height 4: Partial direct writeback: partial_2_8 + "tbz x11, #1, 189f\n" + "str s9, [x9], #0x4\n" + "str s13, [x26], #0x4\n" + "str s17, [x25], #0x4\n" + "str s21, [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v9.h }[2], [x9]\n" + "st1 { v13.h }[2], [x26]\n" + "st1 { v17.h }[2], [x25]\n" + "st1 { v21.h }[2], [x24]\n" + "b 194f\n" + "189:" // Height 4: Partial direct writeback: partial_1_8 + "tbz x11, #0, 194f\n" + "str h9, [x9, #0x0]\n" + "str h13, [x26, #0x0]\n" + "str h17, [x25, #0x0]\n" + "str h21, [x24, #0x0]\n" + "b 194f\n" + "190:" // Height 4: Partial direct writeback: partial_4_0 + "tbz x11, #2, 192f\n" + "str d8, [x9], #0x8\n" + "str d12, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "str d20, [x24], #0x8\n" + "tbz x11, #1, 191f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "st1 { v20.s }[2], [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v8.h }[6], [x9]\n" + "st1 { v12.h }[6], [x26]\n" + "st1 { v16.h }[6], [x25]\n" + "st1 { v20.h }[6], [x24]\n" + "b 194f\n" + "191:" // Height 4: Partial direct writeback: partial_1_4 + "tbz x11, #0, 194f\n" + "st1 { v8.h }[4], [x9]\n" + "st1 { v12.h }[4], [x26]\n" + "st1 { v16.h }[4], [x25]\n" + "st1 { v20.h }[4], [x24]\n" + "b 194f\n" + "192:" // Height 4: Partial direct writeback: partial_2_0 + "tbz x11, #1, 193f\n" + "str s8, [x9], #0x4\n" + "str s12, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "str s20, [x24], #0x4\n" + "tbz x11, #0, 194f\n" + "st1 { v8.h }[2], [x9]\n" + "st1 { v12.h }[2], [x26]\n" + "st1 { v16.h }[2], [x25]\n" + "st1 { v20.h }[2], [x24]\n" + "b 194f\n" + "193:" // Height 4: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "str h12, [x26, #0x0]\n" + "str h16, [x25, #0x0]\n" + "str h20, [x24, #0x0]\n" + 
"194:" // Height 4: Partial direct writeback: Done + "b 196f\n" + "195:" // Height 4: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q12, [x26, #0x0]\n" + "str q13, [x26, #0x10]\n" + "str q14, [x26, #0x20]\n" + "str q15, [x26, #0x30]\n" + "str q16, [x25, #0x0]\n" + "str q17, [x25, #0x10]\n" + "str q18, [x25, #0x20]\n" + "str q19, [x25, #0x30]\n" + "str q20, [x24, #0x0]\n" + "str q21, [x24, #0x10]\n" + "str q22, [x24, #0x20]\n" + "str q23, [x24, #0x30]\n" + "196:" // Height 4: Writeback done + "subs x11, x11, #0x20\n" + "bgt 149b\n" + "b 296f\n" + "197:" // Height 5 + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "198:" // Height 5: Column loop + "cbz x10, 199f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "mov v12.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v20.16b, v8.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "b 218f\n" + "199:" // Height 5: no bias + "tbz %x[flags], #0, 217f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "add x23, x24, x20, LSL #1\n" + "bge 216f\n" + "tbz x11, #4, 207f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "ld1 { v13.8h }, [x26], #0x10\n" + 
"ld1 { v17.8h }, [x25], #0x10\n" + "ld1 { v21.8h }, [x24], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "tbz x11, #3, 203f\n" + "ld1 { v10.8h }, [x9], #0x10\n" + "ld1 { v14.8h }, [x26], #0x10\n" + "ld1 { v18.8h }, [x25], #0x10\n" + "ld1 { v22.8h }, [x24], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "tbz x11, #2, 201f\n" + "ldr d11, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "ldr d19, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "tbz x11, #1, 200f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "ld1 { v15.s }[2], [x26], #0x4\n" + "mov x20, #0x3c\n" + "ld1 { v19.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[6], [x9]\n" + "ld1 { v15.h }[6], [x26]\n" + "ld1 { v19.h }[6], [x25]\n" + "ld1 { v23.h }[6], [x24]\n" + "ld1 { v27.h }[6], [x23]\n" + "b 215f\n" + "200:" // Height 5: Partial accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[4], [x9]\n" + "ld1 { v15.h }[4], [x26]\n" + "ld1 { v19.h }[4], [x25]\n" + "ld1 { v23.h }[4], [x24]\n" + "ld1 { v27.h }[4], [x23]\n" + "b 215f\n" + "201:" // Height 5: Partial accumulate: partial_2_24 + "tbz x11, #1, 202f\n" + "ldr s11, [x9], #0x4\n" + "ldr s15, [x26], #0x4\n" + "mov x20, #0x34\n" + "ldr s19, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v11.h }[2], [x9]\n" + "ld1 { v15.h }[2], [x26]\n" + "ld1 { v19.h }[2], [x25]\n" + "ld1 { v23.h }[2], [x24]\n" + "ld1 { v27.h }[2], [x23]\n" + "b 215f\n" + "202:" // Height 5: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 215f\n" + "ldr h11, [x9, #0x0]\n" + "ldr h15, [x26, #0x0]\n" + "ldr h19, [x25, #0x0]\n" + "ldr h23, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "b 215f\n" + "203:" // Height 5: Partial accumulate: partial_4_16 + "tbz x11, #2, 205f\n" + "ldr d10, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d26, 
[x23], #0x8\n" + "tbz x11, #1, 204f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v14.s }[2], [x26], #0x4\n" + "mov x20, #0x2c\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[6], [x9]\n" + "ld1 { v14.h }[6], [x26]\n" + "ld1 { v18.h }[6], [x25]\n" + "ld1 { v22.h }[6], [x24]\n" + "ld1 { v26.h }[6], [x23]\n" + "b 215f\n" + "204:" // Height 5: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[4], [x9]\n" + "ld1 { v14.h }[4], [x26]\n" + "ld1 { v18.h }[4], [x25]\n" + "ld1 { v22.h }[4], [x24]\n" + "ld1 { v26.h }[4], [x23]\n" + "b 215f\n" + "205:" // Height 5: Partial accumulate: partial_2_16 + "tbz x11, #1, 206f\n" + "ldr s10, [x9], #0x4\n" + "ldr s14, [x26], #0x4\n" + "mov x20, #0x24\n" + "ldr s18, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v10.h }[2], [x9]\n" + "ld1 { v14.h }[2], [x26]\n" + "ld1 { v18.h }[2], [x25]\n" + "ld1 { v22.h }[2], [x24]\n" + "ld1 { v26.h }[2], [x23]\n" + "b 215f\n" + "206:" // Height 5: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 215f\n" + "ldr h10, [x9, #0x0]\n" + "ldr h14, [x26, #0x0]\n" + "ldr h18, [x25, #0x0]\n" + "ldr h22, [x24, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "b 215f\n" + "207:" // Height 5: Partial accumulate: partial_8_0 + "tbz x11, #3, 211f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "tbz x11, #2, 209f\n" + "ldr d9, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "tbz x11, #1, 208f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "tbz x11, 
#0, 215f\n" + "ld1 { v9.h }[6], [x9]\n" + "ld1 { v13.h }[6], [x26]\n" + "ld1 { v17.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" + "b 215f\n" + "208:" // Height 5: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 215f\n" + "ld1 { v9.h }[4], [x9]\n" + "ld1 { v13.h }[4], [x26]\n" + "ld1 { v17.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" + "b 215f\n" + "209:" // Height 5: Partial accumulate: partial_2_8 + "tbz x11, #1, 210f\n" + "ldr s9, [x9], #0x4\n" + "ldr s13, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s17, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v9.h }[2], [x9]\n" + "ld1 { v13.h }[2], [x26]\n" + "ld1 { v17.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "b 215f\n" + "210:" // Height 5: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 215f\n" + "ldr h9, [x9, #0x0]\n" + "ldr h13, [x26, #0x0]\n" + "ldr h17, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "b 215f\n" + "211:" // Height 5: Partial accumulate: partial_4_0 + "tbz x11, #2, 213f\n" + "ldr d8, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "tbz x11, #1, 212f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v16.s }[2], [x25], #0x4\n" + "ld1 { v20.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x26]\n" + "ld1 { v16.h }[6], [x25]\n" + "ld1 { v20.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "b 215f\n" + "212:" // Height 5: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x26]\n" + "ld1 { v16.h }[4], [x25]\n" + "ld1 { v20.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "b 215f\n" + "213:" // Height 5: 
Partial accumulate: partial_2_0 + "tbz x11, #1, 214f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s16, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "tbz x11, #0, 215f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x26]\n" + "ld1 { v16.h }[2], [x25]\n" + "ld1 { v20.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "b 215f\n" + "214:" // Height 5: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x25, #0x0]\n" + "ldr h20, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "215:" // Height 5: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 218f\n" + "216:" // Height 5: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q16, [x25, #0x0]\n" + "ldr q17, [x25, #0x10]\n" + "ldr q18, [x25, #0x20]\n" + "ldr q19, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, [x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "b 218f\n" + "217:" // Height 5: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "218:" // Height 5: setup done + "mov x28, #0x0\n" + "219:" // Height 5: String loop + "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], 
%[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 220f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "cbnz x28, 221f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "b 221f\n" + "220:" // Height 5: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "221:" // Height 5: input setup done + "cmp x27, #0x8\n" + "blt 224f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 223f\n" + "222:" // Height 5: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x23, x23, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "add x22, x22, #0x10\n" + "cmp x27, #0x10\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla 
v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr 
q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x10, #0x180]\n" + 
"fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 222b\n" + "223:" // Height 5: Multiply loop: Single iteration 
only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "add x22, x22, #0x10\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "prfm pldl1keep, [x26, #0x80]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "sub x27, x27, #0x8\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, 
v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, 
v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + 
"fmla v24.8h, v6.8h, v4.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "224:" // Height 5: Multiply loop: Main loop skip + "cbz x27, 226f\n" + "225:" // Height 5: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "cbnz x27, 225b\n" + "226:" // Height 5: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 219b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + 
"add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x20, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "tbz %x[flags], #1, 227f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v29.8h }, [x21]\n" + "ld1r { v28.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v29.8h\n" + "fmin v9.8h, v9.8h, v29.8h\n" + "fmin v10.8h, v10.8h, v29.8h\n" + "fmin v11.8h, v11.8h, v29.8h\n" + "fmin v12.8h, v12.8h, v29.8h\n" + "fmin v13.8h, v13.8h, v29.8h\n" + "fmin v14.8h, v14.8h, v29.8h\n" + "fmin v15.8h, v15.8h, v29.8h\n" + "fmin v16.8h, v16.8h, v29.8h\n" + "fmin v17.8h, v17.8h, v29.8h\n" + "fmin v18.8h, v18.8h, v29.8h\n" + "fmin v19.8h, v19.8h, v29.8h\n" + "fmin v20.8h, v20.8h, v29.8h\n" + "fmin v21.8h, v21.8h, v29.8h\n" + "fmin v22.8h, v22.8h, v29.8h\n" + "fmin v23.8h, v23.8h, v29.8h\n" + "fmin v24.8h, v24.8h, v29.8h\n" + "fmin v25.8h, v25.8h, v29.8h\n" + "fmin v26.8h, v26.8h, v29.8h\n" + "fmin v27.8h, v27.8h, v29.8h\n" + "fmax v8.8h, v8.8h, v28.8h\n" + "fmax v9.8h, v9.8h, v28.8h\n" + "fmax v10.8h, v10.8h, v28.8h\n" + "fmax v11.8h, v11.8h, v28.8h\n" + "fmax v12.8h, v12.8h, v28.8h\n" + "fmax v13.8h, v13.8h, v28.8h\n" + "fmax v14.8h, v14.8h, v28.8h\n" + "fmax v15.8h, v15.8h, v28.8h\n" + "fmax v16.8h, v16.8h, v28.8h\n" + "fmax v17.8h, v17.8h, v28.8h\n" + "fmax v18.8h, v18.8h, v28.8h\n" + "fmax v19.8h, v19.8h, v28.8h\n" + "fmax v20.8h, v20.8h, v28.8h\n" + "fmax v21.8h, v21.8h, v28.8h\n" + "fmax v22.8h, v22.8h, v28.8h\n" + "fmax v23.8h, v23.8h, v28.8h\n" + "fmax v24.8h, v24.8h, v28.8h\n" + "fmax v25.8h, v25.8h, v28.8h\n" + "fmax v26.8h, v26.8h, v28.8h\n" + "fmax v27.8h, v27.8h, v28.8h\n" + "227:" // Height 5: No activation + "cmp x11, #0x20\n" + "bge 244f\n" + "tbz x11, #4, 235f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v13.8h 
}, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v17.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "st1 { v21.8h }, [x24], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "tbz x11, #3, 231f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "st1 { v14.8h }, [x26], #0x10\n" + "st1 { v18.8h }, [x25], #0x10\n" + "st1 { v22.8h }, [x24], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "tbz x11, #2, 229f\n" + "str d11, [x9], #0x8\n" + "str d15, [x26], #0x8\n" + "str d19, [x25], #0x8\n" + "str d23, [x24], #0x8\n" + "str d27, [x23], #0x8\n" + "tbz x11, #1, 228f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x25], #0x4\n" + "st1 { v23.s }[2], [x24], #0x4\n" + "st1 { v27.s }[2], [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v11.h }[6], [x9]\n" + "st1 { v15.h }[6], [x26]\n" + "st1 { v19.h }[6], [x25]\n" + "st1 { v23.h }[6], [x24]\n" + "st1 { v27.h }[6], [x23]\n" + "b 243f\n" + "228:" // Height 5: Partial direct writeback: partial_1_28 + "tbz x11, #0, 243f\n" + "st1 { v11.h }[4], [x9]\n" + "st1 { v15.h }[4], [x26]\n" + "st1 { v19.h }[4], [x25]\n" + "st1 { v23.h }[4], [x24]\n" + "st1 { v27.h }[4], [x23]\n" + "b 243f\n" + "229:" // Height 5: Partial direct writeback: partial_2_24 + "tbz x11, #1, 230f\n" + "str s11, [x9], #0x4\n" + "str s15, [x26], #0x4\n" + "str s19, [x25], #0x4\n" + "str s23, [x24], #0x4\n" + "str s27, [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v11.h }[2], [x9]\n" + "st1 { v15.h }[2], [x26]\n" + "st1 { v19.h }[2], [x25]\n" + "st1 { v23.h }[2], [x24]\n" + "st1 { v27.h }[2], [x23]\n" + "b 243f\n" + "230:" // Height 5: Partial direct writeback: partial_1_24 + "tbz x11, #0, 243f\n" + "str h11, [x9, #0x0]\n" + "str h15, [x26, #0x0]\n" + "str h19, [x25, #0x0]\n" + "str h23, [x24, #0x0]\n" + "str h27, [x23, #0x0]\n" + "b 243f\n" + "231:" // Height 5: Partial direct writeback: partial_4_16 + "tbz x11, #2, 233f\n" + "str d10, [x9], #0x8\n" + "str d14, [x26], #0x8\n" + 
"str d18, [x25], #0x8\n" + "str d22, [x24], #0x8\n" + "str d26, [x23], #0x8\n" + "tbz x11, #1, 232f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "st1 { v14.s }[2], [x26], #0x4\n" + "st1 { v18.s }[2], [x25], #0x4\n" + "st1 { v22.s }[2], [x24], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v10.h }[6], [x9]\n" + "st1 { v14.h }[6], [x26]\n" + "st1 { v18.h }[6], [x25]\n" + "st1 { v22.h }[6], [x24]\n" + "st1 { v26.h }[6], [x23]\n" + "b 243f\n" + "232:" // Height 5: Partial direct writeback: partial_1_20 + "tbz x11, #0, 243f\n" + "st1 { v10.h }[4], [x9]\n" + "st1 { v14.h }[4], [x26]\n" + "st1 { v18.h }[4], [x25]\n" + "st1 { v22.h }[4], [x24]\n" + "st1 { v26.h }[4], [x23]\n" + "b 243f\n" + "233:" // Height 5: Partial direct writeback: partial_2_16 + "tbz x11, #1, 234f\n" + "str s10, [x9], #0x4\n" + "str s14, [x26], #0x4\n" + "str s18, [x25], #0x4\n" + "str s22, [x24], #0x4\n" + "str s26, [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v10.h }[2], [x9]\n" + "st1 { v14.h }[2], [x26]\n" + "st1 { v18.h }[2], [x25]\n" + "st1 { v22.h }[2], [x24]\n" + "st1 { v26.h }[2], [x23]\n" + "b 243f\n" + "234:" // Height 5: Partial direct writeback: partial_1_16 + "tbz x11, #0, 243f\n" + "str h10, [x9, #0x0]\n" + "str h14, [x26, #0x0]\n" + "str h18, [x25, #0x0]\n" + "str h22, [x24, #0x0]\n" + "str h26, [x23, #0x0]\n" + "b 243f\n" + "235:" // Height 5: Partial direct writeback: partial_8_0 + "tbz x11, #3, 239f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "tbz x11, #2, 237f\n" + "str d9, [x9], #0x8\n" + "str d13, [x26], #0x8\n" + "str d17, [x25], #0x8\n" + "str d21, [x24], #0x8\n" + "str d25, [x23], #0x8\n" + "tbz x11, #1, 236f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "st1 { v13.s }[2], [x26], #0x4\n" + "st1 { v17.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x24], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 
{ v9.h }[6], [x9]\n" + "st1 { v13.h }[6], [x26]\n" + "st1 { v17.h }[6], [x25]\n" + "st1 { v21.h }[6], [x24]\n" + "st1 { v25.h }[6], [x23]\n" + "b 243f\n" + "236:" // Height 5: Partial direct writeback: partial_1_12 + "tbz x11, #0, 243f\n" + "st1 { v9.h }[4], [x9]\n" + "st1 { v13.h }[4], [x26]\n" + "st1 { v17.h }[4], [x25]\n" + "st1 { v21.h }[4], [x24]\n" + "st1 { v25.h }[4], [x23]\n" + "b 243f\n" + "237:" // Height 5: Partial direct writeback: partial_2_8 + "tbz x11, #1, 238f\n" + "str s9, [x9], #0x4\n" + "str s13, [x26], #0x4\n" + "str s17, [x25], #0x4\n" + "str s21, [x24], #0x4\n" + "str s25, [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v9.h }[2], [x9]\n" + "st1 { v13.h }[2], [x26]\n" + "st1 { v17.h }[2], [x25]\n" + "st1 { v21.h }[2], [x24]\n" + "st1 { v25.h }[2], [x23]\n" + "b 243f\n" + "238:" // Height 5: Partial direct writeback: partial_1_8 + "tbz x11, #0, 243f\n" + "str h9, [x9, #0x0]\n" + "str h13, [x26, #0x0]\n" + "str h17, [x25, #0x0]\n" + "str h21, [x24, #0x0]\n" + "str h25, [x23, #0x0]\n" + "b 243f\n" + "239:" // Height 5: Partial direct writeback: partial_4_0 + "tbz x11, #2, 241f\n" + "str d8, [x9], #0x8\n" + "str d12, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "str d20, [x24], #0x8\n" + "str d24, [x23], #0x8\n" + "tbz x11, #1, 240f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "st1 { v20.s }[2], [x24], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v8.h }[6], [x9]\n" + "st1 { v12.h }[6], [x26]\n" + "st1 { v16.h }[6], [x25]\n" + "st1 { v20.h }[6], [x24]\n" + "st1 { v24.h }[6], [x23]\n" + "b 243f\n" + "240:" // Height 5: Partial direct writeback: partial_1_4 + "tbz x11, #0, 243f\n" + "st1 { v8.h }[4], [x9]\n" + "st1 { v12.h }[4], [x26]\n" + "st1 { v16.h }[4], [x25]\n" + "st1 { v20.h }[4], [x24]\n" + "st1 { v24.h }[4], [x23]\n" + "b 243f\n" + "241:" // Height 5: Partial direct writeback: partial_2_0 + "tbz x11, #1, 242f\n" + "str s8, [x9], #0x4\n" + "str 
s12, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "str s20, [x24], #0x4\n" + "str s24, [x23], #0x4\n" + "tbz x11, #0, 243f\n" + "st1 { v8.h }[2], [x9]\n" + "st1 { v12.h }[2], [x26]\n" + "st1 { v16.h }[2], [x25]\n" + "st1 { v20.h }[2], [x24]\n" + "st1 { v24.h }[2], [x23]\n" + "b 243f\n" + "242:" // Height 5: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "str h12, [x26, #0x0]\n" + "str h16, [x25, #0x0]\n" + "str h20, [x24, #0x0]\n" + "str h24, [x23, #0x0]\n" + "243:" // Height 5: Partial direct writeback: Done + "b 245f\n" + "244:" // Height 5: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q12, [x26, #0x0]\n" + "str q13, [x26, #0x10]\n" + "str q14, [x26, #0x20]\n" + "str q15, [x26, #0x30]\n" + "str q16, [x25, #0x0]\n" + "str q17, [x25, #0x10]\n" + "str q18, [x25, #0x20]\n" + "str q19, [x25, #0x30]\n" + "str q20, [x24, #0x0]\n" + "str q21, [x24, #0x10]\n" + "str q22, [x24, #0x20]\n" + "str q23, [x24, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "245:" // Height 5: Writeback done + "subs x11, x11, #0x20\n" + "bgt 198b\n" + "b 296f\n" + "246:" // Height 6 + "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" + "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "mov x20, #0xc\n" + "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" + "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" + "madd x20, x21, x20, x9\n" + "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" + "247:" // Height 6: Column loop + "cbz x10, 248f\n" + "ldr q8, [x10, #0x0]\n" + "ldr q9, [x10, #0x10]\n" + "ldr q10, [x10, #0x20]\n" + "ldr q11, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "mov v12.16b, v8.16b\n" + "mov v13.16b, v9.16b\n" + "mov v16.16b, v8.16b\n" + "mov v17.16b, v9.16b\n" + "mov v20.16b, v8.16b\n" + "mov v14.16b, v10.16b\n" + "mov v15.16b, v11.16b\n" + "mov v18.16b, v10.16b\n" + "mov v19.16b, 
v11.16b\n" + "mov v21.16b, v9.16b\n" + "mov v22.16b, v10.16b\n" + "mov v23.16b, v11.16b\n" + "mov v24.16b, v8.16b\n" + "mov v25.16b, v9.16b\n" + "mov v26.16b, v10.16b\n" + "mov v27.16b, v11.16b\n" + "mov v28.16b, v8.16b\n" + "mov v29.16b, v9.16b\n" + "mov v30.16b, v10.16b\n" + "mov v31.16b, v11.16b\n" + "b 267f\n" + "248:" // Height 6: no bias + "tbz %x[flags], #0, 266f\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "cmp x11, #0x20\n" + "add x26, x9, x20, LSL #1\n" + "add x25, x26, x20, LSL #1\n" + "add x24, x25, x20, LSL #1\n" + "add x23, x24, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" + "bge 265f\n" + "tbz x11, #4, 256f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x22], #0x10\n" + "ld1 { v9.8h }, [x9], #0x10\n" + "ld1 { v13.8h }, [x26], #0x10\n" + "ld1 { v17.8h }, [x25], #0x10\n" + "ld1 { v21.8h }, [x24], #0x10\n" + "ld1 { v25.8h }, [x23], #0x10\n" + "ld1 { v29.8h }, [x22], #0x10\n" + "tbz x11, #3, 252f\n" + "ld1 { v10.8h }, [x9], #0x10\n" + "ld1 { v14.8h }, [x26], #0x10\n" + "ld1 { v18.8h }, [x25], #0x10\n" + "ld1 { v22.8h }, [x24], #0x10\n" + "ld1 { v26.8h }, [x23], #0x10\n" + "ld1 { v30.8h }, [x22], #0x10\n" + "tbz x11, #2, 250f\n" + "ldr d11, [x9], #0x8\n" + "ldr d15, [x26], #0x8\n" + "ldr d19, [x25], #0x8\n" + "ldr d23, [x24], #0x8\n" + "ldr d27, [x23], #0x8\n" + "ldr d31, [x22], #0x8\n" + "tbz x11, #1, 249f\n" + "ld1 { v11.s }[2], [x9], #0x4\n" + "ld1 { v15.s }[2], [x26], #0x4\n" + "mov x20, #0x3c\n" + "ld1 { v19.s }[2], [x25], #0x4\n" + "ld1 { v23.s }[2], [x24], #0x4\n" + "ld1 { v27.s }[2], [x23], #0x4\n" + "ld1 { v31.s }[2], [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[6], [x9]\n" + "ld1 { v15.h }[6], [x26]\n" + "ld1 { v19.h }[6], [x25]\n" + "ld1 { v23.h }[6], [x24]\n" + "ld1 { v27.h }[6], [x23]\n" + "ld1 { v31.h }[6], [x22]\n" + "b 264f\n" + "249:" // Height 6: Partial 
accumulate: partial_1_28 + "mov x20, #0x38\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[4], [x9]\n" + "ld1 { v15.h }[4], [x26]\n" + "ld1 { v19.h }[4], [x25]\n" + "ld1 { v23.h }[4], [x24]\n" + "ld1 { v27.h }[4], [x23]\n" + "ld1 { v31.h }[4], [x22]\n" + "b 264f\n" + "250:" // Height 6: Partial accumulate: partial_2_24 + "tbz x11, #1, 251f\n" + "ldr s11, [x9], #0x4\n" + "ldr s15, [x26], #0x4\n" + "mov x20, #0x34\n" + "ldr s19, [x25], #0x4\n" + "ldr s23, [x24], #0x4\n" + "ldr s27, [x23], #0x4\n" + "ldr s31, [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v11.h }[2], [x9]\n" + "ld1 { v15.h }[2], [x26]\n" + "ld1 { v19.h }[2], [x25]\n" + "ld1 { v23.h }[2], [x24]\n" + "ld1 { v27.h }[2], [x23]\n" + "ld1 { v31.h }[2], [x22]\n" + "b 264f\n" + "251:" // Height 6: Partial accumulate: partial_1_24 + "mov x20, #0x30\n" + "tbz x11, #0, 264f\n" + "ldr h11, [x9, #0x0]\n" + "ldr h15, [x26, #0x0]\n" + "ldr h19, [x25, #0x0]\n" + "ldr h23, [x24, #0x0]\n" + "ldr h27, [x23, #0x0]\n" + "ldr h31, [x22, #0x0]\n" + "b 264f\n" + "252:" // Height 6: Partial accumulate: partial_4_16 + "tbz x11, #2, 254f\n" + "ldr d10, [x9], #0x8\n" + "ldr d14, [x26], #0x8\n" + "ldr d18, [x25], #0x8\n" + "ldr d22, [x24], #0x8\n" + "ldr d26, [x23], #0x8\n" + "ldr d30, [x22], #0x8\n" + "tbz x11, #1, 253f\n" + "ld1 { v10.s }[2], [x9], #0x4\n" + "ld1 { v14.s }[2], [x26], #0x4\n" + "mov x20, #0x2c\n" + "ld1 { v18.s }[2], [x25], #0x4\n" + "ld1 { v22.s }[2], [x24], #0x4\n" + "ld1 { v26.s }[2], [x23], #0x4\n" + "ld1 { v30.s }[2], [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[6], [x9]\n" + "ld1 { v14.h }[6], [x26]\n" + "ld1 { v18.h }[6], [x25]\n" + "ld1 { v22.h }[6], [x24]\n" + "ld1 { v26.h }[6], [x23]\n" + "ld1 { v30.h }[6], [x22]\n" + "b 264f\n" + "253:" // Height 6: Partial accumulate: partial_1_20 + "mov x20, #0x28\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[4], [x9]\n" + "ld1 { v14.h }[4], [x26]\n" + "ld1 { v18.h }[4], [x25]\n" + "ld1 { v22.h }[4], [x24]\n" + "ld1 { v26.h }[4], [x23]\n" + "ld1 { v30.h }[4], 
[x22]\n" + "b 264f\n" + "254:" // Height 6: Partial accumulate: partial_2_16 + "tbz x11, #1, 255f\n" + "ldr s10, [x9], #0x4\n" + "ldr s14, [x26], #0x4\n" + "mov x20, #0x24\n" + "ldr s18, [x25], #0x4\n" + "ldr s22, [x24], #0x4\n" + "ldr s26, [x23], #0x4\n" + "ldr s30, [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v10.h }[2], [x9]\n" + "ld1 { v14.h }[2], [x26]\n" + "ld1 { v18.h }[2], [x25]\n" + "ld1 { v22.h }[2], [x24]\n" + "ld1 { v26.h }[2], [x23]\n" + "ld1 { v30.h }[2], [x22]\n" + "b 264f\n" + "255:" // Height 6: Partial accumulate: partial_1_16 + "mov x20, #0x20\n" + "tbz x11, #0, 264f\n" + "ldr h10, [x9, #0x0]\n" + "ldr h14, [x26, #0x0]\n" + "ldr h18, [x25, #0x0]\n" + "ldr h22, [x24, #0x0]\n" + "ldr h26, [x23, #0x0]\n" + "ldr h30, [x22, #0x0]\n" + "b 264f\n" + "256:" // Height 6: Partial accumulate: partial_8_0 + "tbz x11, #3, 260f\n" + "ld1 { v8.8h }, [x9], #0x10\n" + "ld1 { v12.8h }, [x26], #0x10\n" + "ld1 { v16.8h }, [x25], #0x10\n" + "ld1 { v20.8h }, [x24], #0x10\n" + "ld1 { v24.8h }, [x23], #0x10\n" + "ld1 { v28.8h }, [x22], #0x10\n" + "tbz x11, #2, 258f\n" + "ldr d9, [x9], #0x8\n" + "ldr d13, [x26], #0x8\n" + "ldr d17, [x25], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d25, [x23], #0x8\n" + "ldr d29, [x22], #0x8\n" + "tbz x11, #1, 257f\n" + "ld1 { v9.s }[2], [x9], #0x4\n" + "ld1 { v13.s }[2], [x26], #0x4\n" + "mov x20, #0x1c\n" + "ld1 { v17.s }[2], [x25], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v25.s }[2], [x23], #0x4\n" + "ld1 { v29.s }[2], [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[6], [x9]\n" + "ld1 { v13.h }[6], [x26]\n" + "ld1 { v17.h }[6], [x25]\n" + "ld1 { v21.h }[6], [x24]\n" + "ld1 { v25.h }[6], [x23]\n" + "ld1 { v29.h }[6], [x22]\n" + "b 264f\n" + "257:" // Height 6: Partial accumulate: partial_1_12 + "mov x20, #0x18\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[4], [x9]\n" + "ld1 { v13.h }[4], [x26]\n" + "ld1 { v17.h }[4], [x25]\n" + "ld1 { v21.h }[4], [x24]\n" + "ld1 { v25.h }[4], [x23]\n" + "ld1 { v29.h }[4], [x22]\n" + "b 
264f\n" + "258:" // Height 6: Partial accumulate: partial_2_8 + "tbz x11, #1, 259f\n" + "ldr s9, [x9], #0x4\n" + "ldr s13, [x26], #0x4\n" + "mov x20, #0x14\n" + "ldr s17, [x25], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s25, [x23], #0x4\n" + "ldr s29, [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v9.h }[2], [x9]\n" + "ld1 { v13.h }[2], [x26]\n" + "ld1 { v17.h }[2], [x25]\n" + "ld1 { v21.h }[2], [x24]\n" + "ld1 { v25.h }[2], [x23]\n" + "ld1 { v29.h }[2], [x22]\n" + "b 264f\n" + "259:" // Height 6: Partial accumulate: partial_1_8 + "mov x20, #0x10\n" + "tbz x11, #0, 264f\n" + "ldr h9, [x9, #0x0]\n" + "ldr h13, [x26, #0x0]\n" + "ldr h17, [x25, #0x0]\n" + "ldr h21, [x24, #0x0]\n" + "ldr h25, [x23, #0x0]\n" + "ldr h29, [x22, #0x0]\n" + "b 264f\n" + "260:" // Height 6: Partial accumulate: partial_4_0 + "tbz x11, #2, 262f\n" + "ldr d8, [x9], #0x8\n" + "ldr d12, [x26], #0x8\n" + "ldr d16, [x25], #0x8\n" + "ldr d20, [x24], #0x8\n" + "ldr d24, [x23], #0x8\n" + "ldr d28, [x22], #0x8\n" + "tbz x11, #1, 261f\n" + "ld1 { v8.s }[2], [x9], #0x4\n" + "ld1 { v12.s }[2], [x26], #0x4\n" + "mov x20, #0xc\n" + "ld1 { v16.s }[2], [x25], #0x4\n" + "ld1 { v20.s }[2], [x24], #0x4\n" + "ld1 { v24.s }[2], [x23], #0x4\n" + "ld1 { v28.s }[2], [x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[6], [x9]\n" + "ld1 { v12.h }[6], [x26]\n" + "ld1 { v16.h }[6], [x25]\n" + "ld1 { v20.h }[6], [x24]\n" + "ld1 { v24.h }[6], [x23]\n" + "ld1 { v28.h }[6], [x22]\n" + "b 264f\n" + "261:" // Height 6: Partial accumulate: partial_1_4 + "mov x20, #0x8\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[4], [x9]\n" + "ld1 { v12.h }[4], [x26]\n" + "ld1 { v16.h }[4], [x25]\n" + "ld1 { v20.h }[4], [x24]\n" + "ld1 { v24.h }[4], [x23]\n" + "ld1 { v28.h }[4], [x22]\n" + "b 264f\n" + "262:" // Height 6: Partial accumulate: partial_2_0 + "tbz x11, #1, 263f\n" + "ldr s8, [x9], #0x4\n" + "ldr s12, [x26], #0x4\n" + "mov x20, #0x4\n" + "ldr s16, [x25], #0x4\n" + "ldr s20, [x24], #0x4\n" + "ldr s24, [x23], #0x4\n" + "ldr s28, 
[x22], #0x4\n" + "tbz x11, #0, 264f\n" + "ld1 { v8.h }[2], [x9]\n" + "ld1 { v12.h }[2], [x26]\n" + "ld1 { v16.h }[2], [x25]\n" + "ld1 { v20.h }[2], [x24]\n" + "ld1 { v24.h }[2], [x23]\n" + "ld1 { v28.h }[2], [x22]\n" + "b 264f\n" + "263:" // Height 6: Partial accumulate: partial_1_0 + "ldr h8, [x9, #0x0]\n" + "ldr h12, [x26, #0x0]\n" + "mov x20, #0x0\n" + "ldr h16, [x25, #0x0]\n" + "ldr h20, [x24, #0x0]\n" + "ldr h24, [x23, #0x0]\n" + "ldr h28, [x22, #0x0]\n" + "264:" // Height 6: Partial accumulate: Done + "sub x9, x9, x20\n" + "b 267f\n" + "265:" // Height 6: full accumulate + "ldr q8, [x9, #0x0]\n" + "ldr q9, [x9, #0x10]\n" + "ldr q10, [x9, #0x20]\n" + "ldr q11, [x9, #0x30]\n" + "ldr q12, [x26, #0x0]\n" + "ldr q13, [x26, #0x10]\n" + "ldr q14, [x26, #0x20]\n" + "ldr q15, [x26, #0x30]\n" + "ldr q16, [x25, #0x0]\n" + "ldr q17, [x25, #0x10]\n" + "ldr q18, [x25, #0x20]\n" + "ldr q19, [x25, #0x30]\n" + "ldr q20, [x24, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q22, [x24, #0x20]\n" + "ldr q23, [x24, #0x30]\n" + "ldr q24, [x23, #0x0]\n" + "ldr q25, [x23, #0x10]\n" + "ldr q26, [x23, #0x20]\n" + "ldr q27, [x23, #0x30]\n" + "ldr q28, [x22, #0x0]\n" + "ldr q29, [x22, #0x10]\n" + "ldr q30, [x22, #0x20]\n" + "ldr q31, [x22, #0x30]\n" + "b 267f\n" + "266:" // Height 6: no accumulate + "movi v8.16b, #0x0\n" + "movi v9.16b, #0x0\n" + "movi v10.16b, #0x0\n" + "movi v11.16b, #0x0\n" + "movi v12.16b, #0x0\n" + "movi v13.16b, #0x0\n" + "movi v14.16b, #0x0\n" + "movi v15.16b, #0x0\n" + "movi v16.16b, #0x0\n" + "movi v17.16b, #0x0\n" + "movi v18.16b, #0x0\n" + "movi v19.16b, #0x0\n" + "movi v20.16b, #0x0\n" + "movi v21.16b, #0x0\n" + "movi v22.16b, #0x0\n" + "movi v23.16b, #0x0\n" + "movi v24.16b, #0x0\n" + "movi v25.16b, #0x0\n" + "movi v26.16b, #0x0\n" + "movi v27.16b, #0x0\n" + "movi v28.16b, #0x0\n" + "movi v29.16b, #0x0\n" + "movi v30.16b, #0x0\n" + "movi v31.16b, #0x0\n" + "267:" // Height 6: setup done + "mov x28, #0x0\n" + "268:" // Height 6: String loop + "ldr x20, 
[%x[args_ptr], %[offsetof_string_lengths]]\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "ldr w27, [x20, x28, LSL #0x2]\n" + "tbz %x[flags], #3, 269f\n" + "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" + "add x20, x20, x21, LSL #3\n" + "ldr x26, [x20, #0x0]\n" + "ldr x25, [x20, #0x8]\n" + "ldr x24, [x20, #0x10]\n" + "ldr x23, [x20, #0x18]\n" + "ldr x22, [x20, #0x20]\n" + "ldr x21, [x20, #0x28]\n" + "cbnz x28, 270f\n" + "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" + "add x26, x26, x20, LSL #1\n" + "add x25, x25, x20, LSL #1\n" + "add x24, x24, x20, LSL #1\n" + "add x23, x23, x20, LSL #1\n" + "add x22, x22, x20, LSL #1\n" + "add x21, x21, x20, LSL #1\n" + "b 270f\n" + "269:" // Height 6: setup direct input + "mov x26, %x[input_ptr]\n" + "add x25, x26, x21, LSL #1\n" + "add x24, x25, x21, LSL #1\n" + "add x23, x24, x21, LSL #1\n" + "add x22, x23, x21, LSL #1\n" + "add x21, x22, x21, LSL #1\n" + "270:" // Height 6: input setup done + "cmp x27, #0x8\n" + "blt 273f\n" + "ldr q0, [x26, #0x0]\n" + "ldr q1, [x25, #0x0]\n" + "cmp x27, #0x10\n" + "ldr q2, [x24, #0x0]\n" + "ldr q3, [x23, #0x0]\n" + "ldr q4, [x22, #0x0]\n" + "ldr q5, [x21, #0x0]\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "blt 272f\n" + "271:" // Height 6: Multiply loop: Main loop head + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "add x26, x26, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x25, x25, #0x10\n" + "add x24, x24, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "add x23, x23, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x22, x22, #0x10\n" + "add x21, x21, #0x10\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "cmp x27, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x30]\n" + 
"prfm pldl1keep, [x25, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "prfm pldl1keep, [x24, #0x80]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x22, #0x80]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr 
q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + "fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, 
v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + 
"fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x10, #0x0]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "ldr q0, [x26, #0x0]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "ldr q1, [x25, #0x0]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "ldr q2, [x24, #0x0]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "ldr q3, [x23, #0x0]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "ldr q4, [x22, #0x0]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "ldr q5, [x21, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "bge 271b\n" + "272:" // Height 6: Multiply loop: Single iteration only + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "add x26, x26, #0x10\n" + "add x25, x25, #0x10\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "add x22, x22, #0x10\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "add x21, x21, #0x10\n" + "prfm pldl1keep, [x26, #0x80]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "prfm pldl1keep, [x25, #0x80]\n" + "prfm pldl1keep, [x24, 
#0x80]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "prfm pldl1keep, [x23, #0x80]\n" + "fmla v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "sub x27, x27, #0x8\n" + "prfm pldl1keep, [x22, #0x80]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "prfm pldl1keep, [x21, #0x80]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x40]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x50]\n" + "fmla v8.8h, v6.8h, v0.h[1]\n" + "fmla v12.8h, v6.8h, v1.h[1]\n" + "fmla v16.8h, v6.8h, v2.h[1]\n" + "fmla v20.8h, v6.8h, v3.h[1]\n" + "fmla v24.8h, v6.8h, v4.h[1]\n" + "fmla v28.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x10, #0x60]\n" + "fmla v9.8h, v7.8h, v0.h[1]\n" + "fmla v13.8h, v7.8h, v1.h[1]\n" + "fmla v17.8h, v7.8h, v2.h[1]\n" + "fmla v21.8h, v7.8h, v3.h[1]\n" + "fmla v25.8h, v7.8h, v4.h[1]\n" + "fmla v29.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x10, #0x70]\n" + "fmla v10.8h, v6.8h, v0.h[1]\n" + "fmla v14.8h, v6.8h, v1.h[1]\n" + "fmla v18.8h, v6.8h, v2.h[1]\n" + "fmla v22.8h, v6.8h, v3.h[1]\n" + "fmla v26.8h, v6.8h, v4.h[1]\n" + "fmla v30.8h, v6.8h, v5.h[1]\n" + "ldr q6, [x10, #0x80]\n" + "fmla v11.8h, v7.8h, v0.h[1]\n" + "fmla v15.8h, v7.8h, v1.h[1]\n" + "fmla v19.8h, v7.8h, v2.h[1]\n" + "fmla v23.8h, v7.8h, v3.h[1]\n" + "fmla v27.8h, v7.8h, v4.h[1]\n" + "fmla v31.8h, v7.8h, v5.h[1]\n" + "ldr q7, [x10, #0x90]\n" + "fmla v8.8h, v6.8h, v0.h[2]\n" + "fmla v12.8h, v6.8h, v1.h[2]\n" + "fmla v16.8h, v6.8h, v2.h[2]\n" + "fmla v20.8h, v6.8h, v3.h[2]\n" + "fmla v24.8h, v6.8h, v4.h[2]\n" + "fmla v28.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x10, #0xa0]\n" + "fmla v9.8h, v7.8h, v0.h[2]\n" + "fmla v13.8h, v7.8h, v1.h[2]\n" + "fmla v17.8h, v7.8h, v2.h[2]\n" + "fmla v21.8h, v7.8h, v3.h[2]\n" + "fmla v25.8h, 
v7.8h, v4.h[2]\n" + "fmla v29.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x10, #0xb0]\n" + "fmla v10.8h, v6.8h, v0.h[2]\n" + "fmla v14.8h, v6.8h, v1.h[2]\n" + "fmla v18.8h, v6.8h, v2.h[2]\n" + "fmla v22.8h, v6.8h, v3.h[2]\n" + "fmla v26.8h, v6.8h, v4.h[2]\n" + "fmla v30.8h, v6.8h, v5.h[2]\n" + "ldr q6, [x10, #0xc0]\n" + "fmla v11.8h, v7.8h, v0.h[2]\n" + "fmla v15.8h, v7.8h, v1.h[2]\n" + "fmla v19.8h, v7.8h, v2.h[2]\n" + "fmla v23.8h, v7.8h, v3.h[2]\n" + "fmla v27.8h, v7.8h, v4.h[2]\n" + "fmla v31.8h, v7.8h, v5.h[2]\n" + "ldr q7, [x10, #0xd0]\n" + "fmla v8.8h, v6.8h, v0.h[3]\n" + "fmla v12.8h, v6.8h, v1.h[3]\n" + "fmla v16.8h, v6.8h, v2.h[3]\n" + "fmla v20.8h, v6.8h, v3.h[3]\n" + "fmla v24.8h, v6.8h, v4.h[3]\n" + "fmla v28.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x10, #0xe0]\n" + "fmla v9.8h, v7.8h, v0.h[3]\n" + "fmla v13.8h, v7.8h, v1.h[3]\n" + "fmla v17.8h, v7.8h, v2.h[3]\n" + "fmla v21.8h, v7.8h, v3.h[3]\n" + "fmla v25.8h, v7.8h, v4.h[3]\n" + "fmla v29.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x10, #0xf0]\n" + "fmla v10.8h, v6.8h, v0.h[3]\n" + "fmla v14.8h, v6.8h, v1.h[3]\n" + "fmla v18.8h, v6.8h, v2.h[3]\n" + "fmla v22.8h, v6.8h, v3.h[3]\n" + "fmla v26.8h, v6.8h, v4.h[3]\n" + "fmla v30.8h, v6.8h, v5.h[3]\n" + "ldr q6, [x10, #0x100]\n" + "fmla v11.8h, v7.8h, v0.h[3]\n" + "fmla v15.8h, v7.8h, v1.h[3]\n" + "fmla v19.8h, v7.8h, v2.h[3]\n" + "fmla v23.8h, v7.8h, v3.h[3]\n" + "fmla v27.8h, v7.8h, v4.h[3]\n" + "fmla v31.8h, v7.8h, v5.h[3]\n" + "ldr q7, [x10, #0x110]\n" + "fmla v8.8h, v6.8h, v0.h[4]\n" + "fmla v12.8h, v6.8h, v1.h[4]\n" + "fmla v16.8h, v6.8h, v2.h[4]\n" + "fmla v20.8h, v6.8h, v3.h[4]\n" + "fmla v24.8h, v6.8h, v4.h[4]\n" + "fmla v28.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x10, #0x120]\n" + "fmla v9.8h, v7.8h, v0.h[4]\n" + "fmla v13.8h, v7.8h, v1.h[4]\n" + "fmla v17.8h, v7.8h, v2.h[4]\n" + "fmla v21.8h, v7.8h, v3.h[4]\n" + "fmla v25.8h, v7.8h, v4.h[4]\n" + "fmla v29.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x10, #0x130]\n" + "fmla v10.8h, v6.8h, v0.h[4]\n" + "fmla v14.8h, v6.8h, v1.h[4]\n" + 
"fmla v18.8h, v6.8h, v2.h[4]\n" + "fmla v22.8h, v6.8h, v3.h[4]\n" + "fmla v26.8h, v6.8h, v4.h[4]\n" + "fmla v30.8h, v6.8h, v5.h[4]\n" + "ldr q6, [x10, #0x140]\n" + "fmla v11.8h, v7.8h, v0.h[4]\n" + "fmla v15.8h, v7.8h, v1.h[4]\n" + "fmla v19.8h, v7.8h, v2.h[4]\n" + "fmla v23.8h, v7.8h, v3.h[4]\n" + "fmla v27.8h, v7.8h, v4.h[4]\n" + "fmla v31.8h, v7.8h, v5.h[4]\n" + "ldr q7, [x10, #0x150]\n" + "fmla v8.8h, v6.8h, v0.h[5]\n" + "fmla v12.8h, v6.8h, v1.h[5]\n" + "fmla v16.8h, v6.8h, v2.h[5]\n" + "fmla v20.8h, v6.8h, v3.h[5]\n" + "fmla v24.8h, v6.8h, v4.h[5]\n" + "fmla v28.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x10, #0x160]\n" + "fmla v9.8h, v7.8h, v0.h[5]\n" + "fmla v13.8h, v7.8h, v1.h[5]\n" + "fmla v17.8h, v7.8h, v2.h[5]\n" + "fmla v21.8h, v7.8h, v3.h[5]\n" + "fmla v25.8h, v7.8h, v4.h[5]\n" + "fmla v29.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x10, #0x170]\n" + "fmla v10.8h, v6.8h, v0.h[5]\n" + "fmla v14.8h, v6.8h, v1.h[5]\n" + "fmla v18.8h, v6.8h, v2.h[5]\n" + "fmla v22.8h, v6.8h, v3.h[5]\n" + "fmla v26.8h, v6.8h, v4.h[5]\n" + "fmla v30.8h, v6.8h, v5.h[5]\n" + "ldr q6, [x10, #0x180]\n" + "fmla v11.8h, v7.8h, v0.h[5]\n" + "fmla v15.8h, v7.8h, v1.h[5]\n" + "fmla v19.8h, v7.8h, v2.h[5]\n" + "fmla v23.8h, v7.8h, v3.h[5]\n" + "fmla v27.8h, v7.8h, v4.h[5]\n" + "fmla v31.8h, v7.8h, v5.h[5]\n" + "ldr q7, [x10, #0x190]\n" + "fmla v8.8h, v6.8h, v0.h[6]\n" + "fmla v12.8h, v6.8h, v1.h[6]\n" + "fmla v16.8h, v6.8h, v2.h[6]\n" + "fmla v20.8h, v6.8h, v3.h[6]\n" + "fmla v24.8h, v6.8h, v4.h[6]\n" + "fmla v28.8h, v6.8h, v5.h[6]\n" + "ldr q6, [x10, #0x1a0]\n" + "fmla v9.8h, v7.8h, v0.h[6]\n" + "fmla v13.8h, v7.8h, v1.h[6]\n" + "fmla v17.8h, v7.8h, v2.h[6]\n" + "fmla v21.8h, v7.8h, v3.h[6]\n" + "fmla v25.8h, v7.8h, v4.h[6]\n" + "fmla v29.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x10, #0x1b0]\n" + "fmla v10.8h, v6.8h, v0.h[6]\n" + "fmla v14.8h, v6.8h, v1.h[6]\n" + "fmla v18.8h, v6.8h, v2.h[6]\n" + "fmla v22.8h, v6.8h, v3.h[6]\n" + "fmla v26.8h, v6.8h, v4.h[6]\n" + "fmla v30.8h, v6.8h, v5.h[6]\n" + "ldr q6, 
[x10, #0x1c0]\n" + "fmla v11.8h, v7.8h, v0.h[6]\n" + "fmla v15.8h, v7.8h, v1.h[6]\n" + "fmla v19.8h, v7.8h, v2.h[6]\n" + "fmla v23.8h, v7.8h, v3.h[6]\n" + "fmla v27.8h, v7.8h, v4.h[6]\n" + "fmla v31.8h, v7.8h, v5.h[6]\n" + "ldr q7, [x10, #0x1d0]\n" + "fmla v8.8h, v6.8h, v0.h[7]\n" + "fmla v12.8h, v6.8h, v1.h[7]\n" + "fmla v16.8h, v6.8h, v2.h[7]\n" + "fmla v20.8h, v6.8h, v3.h[7]\n" + "fmla v24.8h, v6.8h, v4.h[7]\n" + "fmla v28.8h, v6.8h, v5.h[7]\n" + "ldr q6, [x10, #0x1e0]\n" + "fmla v9.8h, v7.8h, v0.h[7]\n" + "fmla v13.8h, v7.8h, v1.h[7]\n" + "fmla v17.8h, v7.8h, v2.h[7]\n" + "fmla v21.8h, v7.8h, v3.h[7]\n" + "fmla v25.8h, v7.8h, v4.h[7]\n" + "fmla v29.8h, v7.8h, v5.h[7]\n" + "ldr q7, [x10, #0x1f0]\n" + "add x10, x10, #0x200\n" + "fmla v10.8h, v6.8h, v0.h[7]\n" + "fmla v14.8h, v6.8h, v1.h[7]\n" + "fmla v18.8h, v6.8h, v2.h[7]\n" + "fmla v22.8h, v6.8h, v3.h[7]\n" + "fmla v26.8h, v6.8h, v4.h[7]\n" + "fmla v30.8h, v6.8h, v5.h[7]\n" + "fmla v11.8h, v7.8h, v0.h[7]\n" + "fmla v15.8h, v7.8h, v1.h[7]\n" + "fmla v19.8h, v7.8h, v2.h[7]\n" + "fmla v23.8h, v7.8h, v3.h[7]\n" + "fmla v27.8h, v7.8h, v4.h[7]\n" + "fmla v31.8h, v7.8h, v5.h[7]\n" + "273:" // Height 6: Multiply loop: Main loop skip + "cbz x27, 275f\n" + "274:" // Height 6: Multiply loop: Odd block loop + "ldr h0, [x26], #0x2\n" + "ldr h1, [x25], #0x2\n" + "sub x27, x27, #0x1\n" + "ldr h2, [x24], #0x2\n" + "ldr h3, [x23], #0x2\n" + "ldr h4, [x22], #0x2\n" + "ldr h5, [x21], #0x2\n" + "ldr q6, [x10, #0x0]\n" + "ldr q7, [x10, #0x10]\n" + "fmla v8.8h, v6.8h, v0.h[0]\n" + "fmla v12.8h, v6.8h, v1.h[0]\n" + "fmla v16.8h, v6.8h, v2.h[0]\n" + "fmla v20.8h, v6.8h, v3.h[0]\n" + "fmla v24.8h, v6.8h, v4.h[0]\n" + "fmla v28.8h, v6.8h, v5.h[0]\n" + "ldr q6, [x10, #0x20]\n" + "fmla v9.8h, v7.8h, v0.h[0]\n" + "fmla v13.8h, v7.8h, v1.h[0]\n" + "fmla v17.8h, v7.8h, v2.h[0]\n" + "fmla v21.8h, v7.8h, v3.h[0]\n" + "fmla v25.8h, v7.8h, v4.h[0]\n" + "fmla v29.8h, v7.8h, v5.h[0]\n" + "ldr q7, [x10, #0x30]\n" + "add x10, x10, #0x40\n" + "fmla 
v10.8h, v6.8h, v0.h[0]\n" + "fmla v14.8h, v6.8h, v1.h[0]\n" + "fmla v18.8h, v6.8h, v2.h[0]\n" + "fmla v22.8h, v6.8h, v3.h[0]\n" + "fmla v26.8h, v6.8h, v4.h[0]\n" + "fmla v30.8h, v6.8h, v5.h[0]\n" + "fmla v11.8h, v7.8h, v0.h[0]\n" + "fmla v15.8h, v7.8h, v1.h[0]\n" + "fmla v19.8h, v7.8h, v2.h[0]\n" + "fmla v23.8h, v7.8h, v3.h[0]\n" + "fmla v27.8h, v7.8h, v4.h[0]\n" + "fmla v31.8h, v7.8h, v5.h[0]\n" + "cbnz x27, 274b\n" + "275:" // Height 6: Multiply loop: No odd multiplies + "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" + "add x28, x28, #0x1\n" + "cmp x28, x20\n" + "bne 268b\n" + "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" + "prfm pstl1keep, [x9, #0x0]\n" + "add x26, x9, x20, LSL #1\n" + "prfm pstl1keep, [x26, #0x0]\n" + "add x25, x26, x20, LSL #1\n" + "prfm pstl1keep, [x25, #0x0]\n" + "add x24, x25, x20, LSL #1\n" + "prfm pstl1keep, [x24, #0x0]\n" + "add x23, x24, x20, LSL #1\n" + "add x22, x23, x20, LSL #1\n" + "prfm pstl1keep, [x23, #0x0]\n" + "prfm pstl1keep, [x22, #0x0]\n" + "tbz %x[flags], #1, 276f\n" + "add x21, %x[args_ptr], %[offset_max]\n" + "add x20, %x[args_ptr], %[offset_min]\n" + "ld1r { v1.8h }, [x21]\n" + "ld1r { v0.8h }, [x20]\n" + "fmin v8.8h, v8.8h, v1.8h\n" + "fmin v9.8h, v9.8h, v1.8h\n" + "fmin v10.8h, v10.8h, v1.8h\n" + "fmin v11.8h, v11.8h, v1.8h\n" + "fmin v12.8h, v12.8h, v1.8h\n" + "fmin v13.8h, v13.8h, v1.8h\n" + "fmin v14.8h, v14.8h, v1.8h\n" + "fmin v15.8h, v15.8h, v1.8h\n" + "fmin v16.8h, v16.8h, v1.8h\n" + "fmin v17.8h, v17.8h, v1.8h\n" + "fmin v18.8h, v18.8h, v1.8h\n" + "fmin v19.8h, v19.8h, v1.8h\n" + "fmin v20.8h, v20.8h, v1.8h\n" + "fmin v21.8h, v21.8h, v1.8h\n" + "fmin v22.8h, v22.8h, v1.8h\n" + "fmin v23.8h, v23.8h, v1.8h\n" + "fmin v24.8h, v24.8h, v1.8h\n" + "fmin v25.8h, v25.8h, v1.8h\n" + "fmin v26.8h, v26.8h, v1.8h\n" + "fmin v27.8h, v27.8h, v1.8h\n" + "fmin v28.8h, v28.8h, v1.8h\n" + "fmin v29.8h, v29.8h, v1.8h\n" + "fmin v30.8h, v30.8h, v1.8h\n" + "fmin v31.8h, v31.8h, v1.8h\n" + "fmax v8.8h, v8.8h, v0.8h\n" 
+ "fmax v9.8h, v9.8h, v0.8h\n" + "fmax v10.8h, v10.8h, v0.8h\n" + "fmax v11.8h, v11.8h, v0.8h\n" + "fmax v12.8h, v12.8h, v0.8h\n" + "fmax v13.8h, v13.8h, v0.8h\n" + "fmax v14.8h, v14.8h, v0.8h\n" + "fmax v15.8h, v15.8h, v0.8h\n" + "fmax v16.8h, v16.8h, v0.8h\n" + "fmax v17.8h, v17.8h, v0.8h\n" + "fmax v18.8h, v18.8h, v0.8h\n" + "fmax v19.8h, v19.8h, v0.8h\n" + "fmax v20.8h, v20.8h, v0.8h\n" + "fmax v21.8h, v21.8h, v0.8h\n" + "fmax v22.8h, v22.8h, v0.8h\n" + "fmax v23.8h, v23.8h, v0.8h\n" + "fmax v24.8h, v24.8h, v0.8h\n" + "fmax v25.8h, v25.8h, v0.8h\n" + "fmax v26.8h, v26.8h, v0.8h\n" + "fmax v27.8h, v27.8h, v0.8h\n" + "fmax v28.8h, v28.8h, v0.8h\n" + "fmax v29.8h, v29.8h, v0.8h\n" + "fmax v30.8h, v30.8h, v0.8h\n" + "fmax v31.8h, v31.8h, v0.8h\n" + "276:" // Height 6: No activation + "cmp x11, #0x20\n" + "bge 293f\n" + "tbz x11, #4, 284f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v9.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v13.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v17.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "st1 { v21.8h }, [x24], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v25.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x22], #0x10\n" + "st1 { v29.8h }, [x22], #0x10\n" + "tbz x11, #3, 280f\n" + "st1 { v10.8h }, [x9], #0x10\n" + "st1 { v14.8h }, [x26], #0x10\n" + "st1 { v18.8h }, [x25], #0x10\n" + "st1 { v22.8h }, [x24], #0x10\n" + "st1 { v26.8h }, [x23], #0x10\n" + "st1 { v30.8h }, [x22], #0x10\n" + "tbz x11, #2, 278f\n" + "str d11, [x9], #0x8\n" + "str d15, [x26], #0x8\n" + "str d19, [x25], #0x8\n" + "str d23, [x24], #0x8\n" + "str d27, [x23], #0x8\n" + "str d31, [x22], #0x8\n" + "tbz x11, #1, 277f\n" + "st1 { v11.s }[2], [x9], #0x4\n" + "st1 { v15.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x25], #0x4\n" + "st1 { v23.s }[2], [x24], #0x4\n" + "st1 { v27.s }[2], [x23], #0x4\n" + "st1 { v31.s }[2], [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v11.h }[6], [x9]\n" + "st1 { v15.h }[6], 
[x26]\n" + "st1 { v19.h }[6], [x25]\n" + "st1 { v23.h }[6], [x24]\n" + "st1 { v27.h }[6], [x23]\n" + "st1 { v31.h }[6], [x22]\n" + "b 292f\n" + "277:" // Height 6: Partial direct writeback: partial_1_28 + "tbz x11, #0, 292f\n" + "st1 { v11.h }[4], [x9]\n" + "st1 { v15.h }[4], [x26]\n" + "st1 { v19.h }[4], [x25]\n" + "st1 { v23.h }[4], [x24]\n" + "st1 { v27.h }[4], [x23]\n" + "st1 { v31.h }[4], [x22]\n" + "b 292f\n" + "278:" // Height 6: Partial direct writeback: partial_2_24 + "tbz x11, #1, 279f\n" + "str s11, [x9], #0x4\n" + "str s15, [x26], #0x4\n" + "str s19, [x25], #0x4\n" + "str s23, [x24], #0x4\n" + "str s27, [x23], #0x4\n" + "str s31, [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v11.h }[2], [x9]\n" + "st1 { v15.h }[2], [x26]\n" + "st1 { v19.h }[2], [x25]\n" + "st1 { v23.h }[2], [x24]\n" + "st1 { v27.h }[2], [x23]\n" + "st1 { v31.h }[2], [x22]\n" + "b 292f\n" + "279:" // Height 6: Partial direct writeback: partial_1_24 + "tbz x11, #0, 292f\n" + "str h11, [x9, #0x0]\n" + "str h15, [x26, #0x0]\n" + "str h19, [x25, #0x0]\n" + "str h23, [x24, #0x0]\n" + "str h27, [x23, #0x0]\n" + "str h31, [x22, #0x0]\n" + "b 292f\n" + "280:" // Height 6: Partial direct writeback: partial_4_16 + "tbz x11, #2, 282f\n" + "str d10, [x9], #0x8\n" + "str d14, [x26], #0x8\n" + "str d18, [x25], #0x8\n" + "str d22, [x24], #0x8\n" + "str d26, [x23], #0x8\n" + "str d30, [x22], #0x8\n" + "tbz x11, #1, 281f\n" + "st1 { v10.s }[2], [x9], #0x4\n" + "st1 { v14.s }[2], [x26], #0x4\n" + "st1 { v18.s }[2], [x25], #0x4\n" + "st1 { v22.s }[2], [x24], #0x4\n" + "st1 { v26.s }[2], [x23], #0x4\n" + "st1 { v30.s }[2], [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v10.h }[6], [x9]\n" + "st1 { v14.h }[6], [x26]\n" + "st1 { v18.h }[6], [x25]\n" + "st1 { v22.h }[6], [x24]\n" + "st1 { v26.h }[6], [x23]\n" + "st1 { v30.h }[6], [x22]\n" + "b 292f\n" + "281:" // Height 6: Partial direct writeback: partial_1_20 + "tbz x11, #0, 292f\n" + "st1 { v10.h }[4], [x9]\n" + "st1 { v14.h }[4], [x26]\n" + "st1 { v18.h 
}[4], [x25]\n" + "st1 { v22.h }[4], [x24]\n" + "st1 { v26.h }[4], [x23]\n" + "st1 { v30.h }[4], [x22]\n" + "b 292f\n" + "282:" // Height 6: Partial direct writeback: partial_2_16 + "tbz x11, #1, 283f\n" + "str s10, [x9], #0x4\n" + "str s14, [x26], #0x4\n" + "str s18, [x25], #0x4\n" + "str s22, [x24], #0x4\n" + "str s26, [x23], #0x4\n" + "str s30, [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v10.h }[2], [x9]\n" + "st1 { v14.h }[2], [x26]\n" + "st1 { v18.h }[2], [x25]\n" + "st1 { v22.h }[2], [x24]\n" + "st1 { v26.h }[2], [x23]\n" + "st1 { v30.h }[2], [x22]\n" + "b 292f\n" + "283:" // Height 6: Partial direct writeback: partial_1_16 + "tbz x11, #0, 292f\n" + "str h10, [x9, #0x0]\n" + "str h14, [x26, #0x0]\n" + "str h18, [x25, #0x0]\n" + "str h22, [x24, #0x0]\n" + "str h26, [x23, #0x0]\n" + "str h30, [x22, #0x0]\n" + "b 292f\n" + "284:" // Height 6: Partial direct writeback: partial_8_0 + "tbz x11, #3, 288f\n" + "st1 { v8.8h }, [x9], #0x10\n" + "st1 { v12.8h }, [x26], #0x10\n" + "st1 { v16.8h }, [x25], #0x10\n" + "st1 { v20.8h }, [x24], #0x10\n" + "st1 { v24.8h }, [x23], #0x10\n" + "st1 { v28.8h }, [x22], #0x10\n" + "tbz x11, #2, 286f\n" + "str d9, [x9], #0x8\n" + "str d13, [x26], #0x8\n" + "str d17, [x25], #0x8\n" + "str d21, [x24], #0x8\n" + "str d25, [x23], #0x8\n" + "str d29, [x22], #0x8\n" + "tbz x11, #1, 285f\n" + "st1 { v9.s }[2], [x9], #0x4\n" + "st1 { v13.s }[2], [x26], #0x4\n" + "st1 { v17.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x24], #0x4\n" + "st1 { v25.s }[2], [x23], #0x4\n" + "st1 { v29.s }[2], [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v9.h }[6], [x9]\n" + "st1 { v13.h }[6], [x26]\n" + "st1 { v17.h }[6], [x25]\n" + "st1 { v21.h }[6], [x24]\n" + "st1 { v25.h }[6], [x23]\n" + "st1 { v29.h }[6], [x22]\n" + "b 292f\n" + "285:" // Height 6: Partial direct writeback: partial_1_12 + "tbz x11, #0, 292f\n" + "st1 { v9.h }[4], [x9]\n" + "st1 { v13.h }[4], [x26]\n" + "st1 { v17.h }[4], [x25]\n" + "st1 { v21.h }[4], [x24]\n" + "st1 { v25.h }[4], 
[x23]\n" + "st1 { v29.h }[4], [x22]\n" + "b 292f\n" + "286:" // Height 6: Partial direct writeback: partial_2_8 + "tbz x11, #1, 287f\n" + "str s9, [x9], #0x4\n" + "str s13, [x26], #0x4\n" + "str s17, [x25], #0x4\n" + "str s21, [x24], #0x4\n" + "str s25, [x23], #0x4\n" + "str s29, [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v9.h }[2], [x9]\n" + "st1 { v13.h }[2], [x26]\n" + "st1 { v17.h }[2], [x25]\n" + "st1 { v21.h }[2], [x24]\n" + "st1 { v25.h }[2], [x23]\n" + "st1 { v29.h }[2], [x22]\n" + "b 292f\n" + "287:" // Height 6: Partial direct writeback: partial_1_8 + "tbz x11, #0, 292f\n" + "str h9, [x9, #0x0]\n" + "str h13, [x26, #0x0]\n" + "str h17, [x25, #0x0]\n" + "str h21, [x24, #0x0]\n" + "str h25, [x23, #0x0]\n" + "str h29, [x22, #0x0]\n" + "b 292f\n" + "288:" // Height 6: Partial direct writeback: partial_4_0 + "tbz x11, #2, 290f\n" + "str d8, [x9], #0x8\n" + "str d12, [x26], #0x8\n" + "str d16, [x25], #0x8\n" + "str d20, [x24], #0x8\n" + "str d24, [x23], #0x8\n" + "str d28, [x22], #0x8\n" + "tbz x11, #1, 289f\n" + "st1 { v8.s }[2], [x9], #0x4\n" + "st1 { v12.s }[2], [x26], #0x4\n" + "st1 { v16.s }[2], [x25], #0x4\n" + "st1 { v20.s }[2], [x24], #0x4\n" + "st1 { v24.s }[2], [x23], #0x4\n" + "st1 { v28.s }[2], [x22], #0x4\n" + "tbz x11, #0, 292f\n" + "st1 { v8.h }[6], [x9]\n" + "st1 { v12.h }[6], [x26]\n" + "st1 { v16.h }[6], [x25]\n" + "st1 { v20.h }[6], [x24]\n" + "st1 { v24.h }[6], [x23]\n" + "st1 { v28.h }[6], [x22]\n" + "b 292f\n" + "289:" // Height 6: Partial direct writeback: partial_1_4 + "tbz x11, #0, 292f\n" + "st1 { v8.h }[4], [x9]\n" + "st1 { v12.h }[4], [x26]\n" + "st1 { v16.h }[4], [x25]\n" + "st1 { v20.h }[4], [x24]\n" + "st1 { v24.h }[4], [x23]\n" + "st1 { v28.h }[4], [x22]\n" + "b 292f\n" + "290:" // Height 6: Partial direct writeback: partial_2_0 + "tbz x11, #1, 291f\n" + "str s8, [x9], #0x4\n" + "str s12, [x26], #0x4\n" + "str s16, [x25], #0x4\n" + "str s20, [x24], #0x4\n" + "str s24, [x23], #0x4\n" + "str s28, [x22], #0x4\n" + "tbz x11, #0, 
292f\n" + "st1 { v8.h }[2], [x9]\n" + "st1 { v12.h }[2], [x26]\n" + "st1 { v16.h }[2], [x25]\n" + "st1 { v20.h }[2], [x24]\n" + "st1 { v24.h }[2], [x23]\n" + "st1 { v28.h }[2], [x22]\n" + "b 292f\n" + "291:" // Height 6: Partial direct writeback: partial_1_0 + "str h8, [x9, #0x0]\n" + "str h12, [x26, #0x0]\n" + "str h16, [x25, #0x0]\n" + "str h20, [x24, #0x0]\n" + "str h24, [x23, #0x0]\n" + "str h28, [x22, #0x0]\n" + "292:" // Height 6: Partial direct writeback: Done + "b 294f\n" + "293:" // Height 6: Full writeback + "str q8, [x9, #0x0]\n" + "str q9, [x9, #0x10]\n" + "str q10, [x9, #0x20]\n" + "str q11, [x9, #0x30]\n" + "add x9, x9, #0x40\n" + "str q12, [x26, #0x0]\n" + "str q13, [x26, #0x10]\n" + "str q14, [x26, #0x20]\n" + "str q15, [x26, #0x30]\n" + "str q16, [x25, #0x0]\n" + "str q17, [x25, #0x10]\n" + "str q18, [x25, #0x20]\n" + "str q19, [x25, #0x30]\n" + "str q20, [x24, #0x0]\n" + "str q21, [x24, #0x10]\n" + "str q22, [x24, #0x20]\n" + "str q23, [x24, #0x30]\n" + "str q24, [x23, #0x0]\n" + "str q25, [x23, #0x10]\n" + "str q26, [x23, #0x20]\n" + "str q27, [x23, #0x30]\n" + "str q28, [x22, #0x0]\n" + "str q29, [x22, #0x10]\n" + "str q30, [x22, #0x20]\n" + "str q31, [x22, #0x30]\n" + "294:" // Height 6: Writeback done + "subs x11, x11, #0x20\n" + "bgt 247b\n" + "subs %x[m], %x[m], #0x6\n" + "beq 296f\n" + "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "tbz %x[flags], #3, 295f\n" + "add x21, x21, #0x6\n" + "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" + "b 1b\n" + "295:" // Update direct input + "mov x20, #0xc\n" + "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" + "b 1b\n" + "296:" // Exit + : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) + : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), + [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), + [offsetof_N] "I"(offsetof(KernelArgs, N)), + [offsetof_input_initial_col] "I"(offsetof(KernelArgs, 
input_initial_col)), + [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), + [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), + [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), + [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), + [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} + +#endif // Architectural features check. diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h new file mode 100644 index 00000000..b1a0d809 --- /dev/null +++ b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h @@ -0,0 +1,126 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#if !defined(__aarch64__) || !defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) || \ + !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#error This file must be compiled for AArch64, FEAT_FP16. +#else // Architectural features check. + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/// Micro-kernel dependencies +/// +/// -# kai_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets m step value. +/// +/// The starting row index must be divisible by `m_step`. +/// +/// @return The m step value. +size_t kai_get_m_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void); + +/// Gets n step value. 
+/// +/// The starting column index must be divisible by `n_step`. +/// +/// @return The n step value. +size_t kai_get_n_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void); + +/// Gets nr value. +/// +/// This is the packing parameter which must be used to pack the RHS matrix. +/// +/// @return The nr value. +size_t kai_get_nr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void); + +/// Gets kr value. +/// +/// This is the packing parameter which must be used to pack the RHS matrix. +/// +/// @return The kr value. +size_t kai_get_kr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void); + +/// Gets sr value. +/// +/// This is the packing parameter which must be used to pack the RHS matrix. +/// +/// @return The sr value. +size_t kai_get_sr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(void); + +/// Gets the offset in bytes to the data element in the LHS matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_lhs_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t m_idx, size_t stride); + +/// Gets the offset in bytes to the data element in the packed RHS matrix buffer. +/// +/// @param[in] n_idx Row index. +/// @param[in] k Number of columns. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t n_idx, size_t k); + +/// Gets the offset in bytes to the data element in the destination matrix buffer. +/// +/// @param[in] m_idx Row index. +/// @param[in] n_idx Column index. +/// @param[in] stride Row stride in bytes. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_dst_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla( + size_t m_idx, size_t n_idx, size_t stride); + +/// Gets the size in bytes of the destination matrix buffer. +/// +/// @param[in] m Number of rows. 
+/// @param[in] n Number of columns. +/// +/// @return The size in bytes of the destination matrix buffer. +size_t kai_get_dst_size_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla(size_t m, size_t n); + +/// Runs the matrix multiplication microkernel followed by a clamp operation. +/// +/// The pointer to each buffer (LHS, packed RHS and output) must be advanced by the offset +/// calculated using the following functions: +/// +/// * LHS: @ref kai_get_lhs_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla. +/// * Packed RHS: @ref kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla. +/// * Output: @ref kai_get_dst_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla. +/// +/// @param[in] m Number of output rows to be computed. +/// @param[in] n Number of output columns to be computed. +/// @param[in] k Common dimension of the LHS and RHS operand. +/// @param[in] lhs LHS matrix buffer. +/// @param[in] lhs_stride Row stride in bytes of the LHS matrix. +/// @param[in] rhs_packed Packed RHS buffer. +/// @param[out] dst Output matrix buffer. +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(__fp16) +/// @param[in] clamp_min Minimum value to clamp the final result. +/// @param[in] clamp_max Maximum value to clamp the final result. +void kai_run_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla( + size_t m, size_t n, size_t k, // + const void* lhs, size_t lhs_stride, // + const void* rhs_packed, // + void* dst, size_t dst_stride_row, size_t dst_stride_col, // + __fp16 clamp_min, __fp16 clamp_max); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // Architectural features check.
// Packing block sizes. kai_run_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon rejects any other nr/kr.
static const size_t kai_nr = 32;
static const size_t kai_kr = 1;

/// Returns the n step of this packing routine, which equals the block width kai_nr.
size_t kai_get_n_step_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(void) {
    return kai_nr;
}

/// Returns the byte offset of column n_idx in the unpacked RHS (2 bytes per element).
///
/// n_idx must be a multiple of kai_nr.
size_t kai_get_rhs_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n_idx) {
    KAI_ASSUME(n_idx % kai_nr == 0);

    return n_idx * sizeof(uint16_t);
}

/// Returns the byte offset of element n_idx in the bias vector (2 bytes per element).
size_t kai_get_bias_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n_idx) {
    return n_idx * sizeof(uint16_t);
}

/// Returns the byte offset in the packed RHS of the block that starts at column n_idx.
///
/// Every column contributes one bias element plus k data elements, all 2 bytes wide.
/// n_idx must be a multiple of kai_nr.
size_t kai_get_rhs_packed_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n_idx, size_t k) {
    KAI_ASSUME(n_idx % kai_nr == 0);

    return n_idx * (sizeof(uint16_t) + k * sizeof(uint16_t));
}

/// Returns the size in bytes of the packed RHS buffer for an RHS with n columns and k rows.
///
/// Computed as the packed offset of column kai_roundup(n, kai_nr).
/// NOTE(review): kai_roundup comes from kai_common.h and presumably rounds n up to a multiple
/// of kai_nr - confirm.
size_t kai_get_rhs_packed_size_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n, size_t k) {
    return kai_get_rhs_packed_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(kai_roundup(n, kai_nr), k);
}

/// Packs a k x n RHS matrix of 16-bit elements, together with its bias, into blocks of 32 columns.
///
/// Layout produced by the assembly below, per block of 32 columns:
///   * 0x40 bytes of bias values,
///   * followed by k rows of 0x40 bytes each (32 elements per row).
/// Consecutive blocks are out_stride = 32 * (k + 1) * 2 bytes apart. For a final partial block
/// the row data is zero-filled first and then overwritten with the valid columns; the bias of a
/// partial block is copied element by element without padding.
///
/// @param[in]  num_groups  Must be 1.
/// @param[in]  n           Number of RHS columns.
/// @param[in]  k           Number of RHS rows.
/// @param[in]  nr          Must be 32.
/// @param[in]  kr          Must be 1.
/// @param[in]  sr          Must be 1.
/// @param[in]  rhs_stride  Row stride in bytes of the RHS matrix.
/// @param[in]  rhs         RHS matrix buffer. Must not be NULL.
/// @param[in]  bias        Bias buffer holding n elements. Must not be NULL.
/// @param[in]  scale       Must be NULL.
/// @param[out] rhs_packed  Packed RHS buffer. Must not be NULL.
/// @param[in]  extra_bytes Must be 0.
/// @param[in]  params      Must be NULL.
void kai_run_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(
    size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
    const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params) {
    KAI_ASSUME(num_groups == 1);
    KAI_ASSUME(nr == kai_nr);
    KAI_ASSUME(kr == kai_kr);
    KAI_ASSUME(sr == 1);
    KAI_ASSUME(rhs != NULL);
    KAI_ASSUME(bias != NULL);
    KAI_ASSUME(scale == NULL);
    KAI_ASSUME(rhs_packed != NULL);
    KAI_ASSUME(extra_bytes == 0);
    KAI_ASSUME(params == NULL);

    // Degenerate shapes are handled here because the assembly enters both its bias tail loop
    // ("blt 2f") and its row tail loop ("blt 14f") without first testing for a zero count.
    // With n == 0 or k == 0 it would still run one iteration and access memory outside the
    // bias, RHS and packed buffers.
    if (n == 0) {
        return;
    }
    if (k == 0) {
        // Without rows every block holds only its bias and the block stride equals the bias
        // size (0x40 bytes), so the packed buffer is simply the n bias values back to back.
        const uint8_t* bias_bytes = (const uint8_t*)bias;
        uint8_t* packed_bytes = (uint8_t*)rhs_packed;
        for (size_t i = 0; i < n * sizeof(uint16_t); ++i) {
            packed_bytes[i] = bias_bytes[i];
        }
        return;
    }

    size_t height = k;  // Decremented by the assembly as rows are consumed.
    const size_t width = n;
    const void* in = rhs;
    void* out = rhs_packed;
    const size_t in_stride = rhs_stride;
    // Distance in bytes between two consecutive 32-column blocks: k rows plus one bias row.
    const size_t out_stride = kai_nr * height * sizeof(uint16_t) + kai_nr * sizeof(uint16_t);

    __asm__ __volatile__(
        "mov x22, %x[width]\n"
        "mov x21, %x[out]\n"
        "cmp x22, #0x20\n"
        "blt 2f\n"
        "1:"  // Bias: Full loop
        "ldr q19, [%x[bias], #0x0]\n"
        "ldr q18, [%x[bias], #0x10]\n"
        "sub x22, x22, #0x20\n"
        "ldr q17, [%x[bias], #0x20]\n"
        "ldr q16, [%x[bias], #0x30]\n"
        "cmp x22, #0x20\n"
        "add %x[bias], %x[bias], #0x40\n"
        "str q19, [x21, #0x0]\n"
        "str q18, [x21, #0x10]\n"
        "str q17, [x21, #0x20]\n"
        "str q16, [x21, #0x30]\n"
        "add x21, x21, %x[out_stride]\n"
        "bge 1b\n"
        "cbz x22, 3f\n"
        "2:"  // Bias: Tail loop
        "ldr h20, [%x[bias], #0x0]\n"
        "sub x22, x22, #0x1\n"
        "add %x[bias], %x[bias], #0x2\n"
        "cmp x22, #0x0\n"
        "str h20, [x21]\n"
        "add x21, x21, #0x2\n"
        "bgt 2b\n"
        "3:"  // Bias: Done
        "cmp %x[height], #0x4\n"
        "add %x[out], %x[out], #0x40\n"
        "blt 14f\n"
        "4:"  // Main row loop: Head
        "mov x25, %x[in]\n"
        "mov x24, %x[width]\n"
        "mov x23, %x[out]\n"
        "sub %x[height], %x[height], #0x4\n"
        "add x22, x25, %x[in_stride]\n"
        "add x21, x22, %x[in_stride]\n"
        "add x20, x21, %x[in_stride]\n"
        "cmp x24, #0x20\n"
        "add %x[in], x20, %x[in_stride]\n"
        "blt 6f\n"
        "5:"  // Main row loop: Column loop
        "ldr q31, [x25], #0x10\n"
        "ldr q30, [x22], #0x10\n"
        "sub x24, x24, #0x20\n"
        "ldr q29, [x21], #0x10\n"
        "ldr q28, [x20], #0x10\n"
        "cmp x24, #0x20\n"
        "ldr q27, [x25], #0x10\n"
        "ldr q26, [x22], #0x10\n"
        "ldr q25, [x21], #0x10\n"
        "ldr q24, [x20], #0x10\n"
        "ldr q23, [x25], #0x10\n"
        "ldr q22, [x22], #0x10\n"
        "ldr q21, [x21], #0x10\n"
        "ldr q20, [x20], #0x10\n"
        "ldr q19, [x25], #0x10\n"
        "ldr q18, [x22], #0x10\n"
        "ldr q17, [x21], #0x10\n"
        "ldr q16, [x20], #0x10\n"
        "str q31, [x23, #0x0]\n"
        "str q27, [x23, #0x10]\n"
        "str q23, [x23, #0x20]\n"
        "str q19, [x23, #0x30]\n"
        "str q30, [x23, #0x40]\n"
        "str q26, [x23, #0x50]\n"
        "str q22, [x23, #0x60]\n"
        "str q18, [x23, #0x70]\n"
        "str q29, [x23, #0x80]\n"
        "str q25, [x23, #0x90]\n"
        "str q21, [x23, #0xa0]\n"
        "str q17, [x23, #0xb0]\n"
        "str q28, [x23, #0xc0]\n"
        "str q24, [x23, #0xd0]\n"
        "str q20, [x23, #0xe0]\n"
        "str q16, [x23, #0xf0]\n"
        "add x23, x23, %x[out_stride]\n"
        "bge 5b\n"
        "6:"  // Main row loop: Column loop skip
        "cbz x24, 13f\n"
        "cmp x24, #0x10\n"
        "movi v16.8h, #0x0\n"
        "str q16, [x23, #0x0]\n"
        "str q16, [x23, #0x10]\n"
        "str q16, [x23, #0x20]\n"
        "str q16, [x23, #0x30]\n"
        "str q16, [x23, #0x40]\n"
        "str q16, [x23, #0x50]\n"
        "str q16, [x23, #0x60]\n"
        "str q16, [x23, #0x70]\n"
        "str q16, [x23, #0x80]\n"
        "str q16, [x23, #0x90]\n"
        "str q16, [x23, #0xa0]\n"
        "str q16, [x23, #0xb0]\n"
        "str q16, [x23, #0xc0]\n"
        "str q16, [x23, #0xd0]\n"
        "str q16, [x23, #0xe0]\n"
        "str q16, [x23, #0xf0]\n"
        "blt 8f\n"
        "7:"  // Main row loop: width 16 loop: loop
        "ldr q23, [x25], #0x10\n"
        "ldr q22, [x22], #0x10\n"
        "sub x24, x24, #0x10\n"
        "ldr q21, [x21], #0x10\n"
        "ldr q20, [x20], #0x10\n"
        "cmp x24, #0x10\n"
        "ldr q19, [x25], #0x10\n"
        "ldr q18, [x22], #0x10\n"
        "ldr q17, [x21], #0x10\n"
        "ldr q16, [x20], #0x10\n"
        "str q23, [x23, #0x0]\n"
        "str q19, [x23, #0x10]\n"
        "str q22, [x23, #0x40]\n"
        "str q18, [x23, #0x50]\n"
        "str q21, [x23, #0x80]\n"
        "str q17, [x23, #0x90]\n"
        "str q20, [x23, #0xc0]\n"
        "str q16, [x23, #0xd0]\n"
        "add x23, x23, #0x20\n"
        "bge 7b\n"
        "8:"  // Main row loop: width 16 loop: skip
        "cmp x24, #0x4\n"
        "blt 10f\n"
        "9:"  // Main row loop: width 4 loop: loop
        "ldr d19, [x25], #0x8\n"
        "ldr d18, [x22], #0x8\n"
        "sub x24, x24, #0x4\n"
        "ldr d17, [x21], #0x8\n"
        "ldr d16, [x20], #0x8\n"
        "cmp x24, #0x4\n"
        "str d19, [x23, #0x0]\n"
        "str d18, [x23, #0x40]\n"
        "str d17, [x23, #0x80]\n"
        "str d16, [x23, #0xc0]\n"
        "add x23, x23, #0x8\n"
        "bge 9b\n"
        "10:"  // Main row loop: width 4 loop: skip
        "cmp x24, #0x1\n"
        "blt 12f\n"
        "11:"  // Main row loop: width 1 loop: loop
        "ldr h19, [x25], #0x2\n"
        "ldr h18, [x22], #0x2\n"
        "sub x24, x24, #0x1\n"
        "ldr h17, [x21], #0x2\n"
        "ldr h16, [x20], #0x2\n"
        "cmp x24, #0x1\n"
        "str h19, [x23, #0x0]\n"
        "str h18, [x23, #0x40]\n"
        "str h17, [x23, #0x80]\n"
        "str h16, [x23, #0xc0]\n"
        "add x23, x23, #0x2\n"
        "bge 11b\n"
        "12:"  // Main row loop: width 1 loop: skip
        "13:"  // Main row loop: odd col skip
        "cmp %x[height], #0x4\n"
        "add %x[out], %x[out], #0x100\n"
        "bge 4b\n"
        "cbz %x[height], 25f\n"
        "14:"  // Main loop skip
        "15:"  // Tail row loop: Head
        "mov x20, %x[width]\n"
        "mov x25, %x[in]\n"
        "mov x23, %x[out]\n"
        "sub %x[height], %x[height], #0x1\n"
        "cmp x20, #0x20\n"
        "add %x[in], x25, %x[in_stride]\n"
        "blt 17f\n"
        "16:"  // Tail row loop: Column loop
        "ldr q19, [x25], #0x10\n"
        "sub x20, x20, #0x20\n"
        "ldr q18, [x25], #0x10\n"
        "ldr q17, [x25], #0x10\n"
        "cmp x20, #0x20\n"
        "ldr q16, [x25], #0x10\n"
        "str q19, [x23, #0x0]\n"
        "str q18, [x23, #0x10]\n"
        "str q17, [x23, #0x20]\n"
        "str q16, [x23, #0x30]\n"
        "add x23, x23, %x[out_stride]\n"
        "bge 16b\n"
        "17:"  // Tail row loop: Column loop skip
        "cbz x20, 24f\n"
        "cmp x20, #0x10\n"
        "movi v16.8h, #0x0\n"
        "str q16, [x23, #0x0]\n"
        "str q16, [x23, #0x10]\n"
        "str q16, [x23, #0x20]\n"
        "str q16, [x23, #0x30]\n"
        "blt 19f\n"
        "18:"  // Tail row loop: width 16 loop: loop
        "ldr q17, [x25], #0x10\n"
        "sub x20, x20, #0x10\n"
        "ldr q16, [x25], #0x10\n"
        "cmp x20, #0x10\n"
        "str q17, [x23, #0x0]\n"
        "str q16, [x23, #0x10]\n"
        "add x23, x23, #0x20\n"
        "bge 18b\n"
        "19:"  // Tail row loop: width 16 loop: skip
        "cmp x20, #0x4\n"
        "blt 21f\n"
        "20:"  // Tail row loop: width 4 loop: loop
        "ldr d16, [x25], #0x8\n"
        "sub x20, x20, #0x4\n"
        "cmp x20, #0x4\n"
        "str d16, [x23, #0x0]\n"
        "add x23, x23, #0x8\n"
        "bge 20b\n"
        "21:"  // Tail row loop: width 4 loop: skip
        "cmp x20, #0x1\n"
        "blt 23f\n"
        "22:"  // Tail row loop: width 1 loop: loop
        "ldr h16, [x25], #0x2\n"
        "sub x20, x20, #0x1\n"
        "cmp x20, #0x1\n"
        "str h16, [x23, #0x0]\n"
        "add x23, x23, #0x2\n"
        "bge 22b\n"
        "23:"  // Tail row loop: width 1 loop: skip
        "24:"  // Tail row loop: odd col skip
        "cmp %x[height], #0x1\n"
        "add %x[out], %x[out], #0x40\n"
        "bge 15b\n"
        "25:"  // Done
        : [bias] "+&r"(bias), [height] "+&r"(height), [in] "+&r"(in), [out] "+&r"(out)
        : [in_stride] "r"(in_stride), [out_stride] "r"(out_stride), [width] "r"(width)
        : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
          "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25");
}
+size_t kai_get_bias_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n_idx); + +/// Gets the offset in bytes to the data element in the packed RHS buffer. +/// +/// @param[in] n_idx Row index. +/// @param[in] k Number of columns. +/// +/// @return The offset in bytes to the data element. +size_t kai_get_rhs_packed_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n_idx, size_t k); + +/// Gets the size in bytes of the packed RHS buffer. +/// +/// @param[in] n Number of rows. +/// @param[in] k Number of columns. +/// +/// @return The size in bytes of the packed RHS buffer. +size_t kai_get_rhs_packed_size_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon(size_t n, size_t k); + +/// Runs the RHS packing function for matrix multiplication. +/// +/// The pointer to each buffer (RHS, bias and packed RHS) must be advanced by the offset +/// calculated using the following functions: +/// +/// * RHS: @ref kai_get_rhs_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon. +/// * Bias: @ref kai_get_bias_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon. +/// * Output: @ref kai_get_rhs_packed_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon. +/// +/// @param[in] num_groups Number of groups. It must be 1. +/// @param[in] n Number of columns of the output matrix. +/// @param[in] k Common dimension between the LHS and RHS matrix. +/// @param[in] nr Block size in N dimension. It must be 32. +/// @param[in] kr Block size in K dimension. It must be 1. +/// @param[in] sr Number of kr splits. It must be 1. +/// @param[in] rhs_stride Row stride in bytes of the RHS matrix. +/// @param[in] rhs RHS matrix data buffer. +/// @param[in] bias Bias matrix data buffer. +/// @param[in] scale Scale data buffer. It must be NULL. +/// @param[out] rhs_packed Packed RHS matrix. +/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. It must be 0. +/// @param[in] params Extra packing parameters. It must be NULL.
+void kai_run_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon( + size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs, + const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index f2d5e544..16ab3a25 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -36,6 +36,10 @@ #include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" +// matmul_nt_nt_fp16_fp16_fp16_6x32_neon_mla +#include "kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla.h" +#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon.h" + // matmul_nt_nt_fp32_fp32_fp32_2vlx2vl_sme2_mopa #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h" #include "kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.h" @@ -386,6 +390,54 @@ static const std::array matmul_methods = { .fn_matmul_f32_f32p_f32p = nullptr, }, + MatMulMethod{ + .name = "matmul_nt_nt_fp16_fp16_fp16_6x32_neon_mla", + + .m0 = 6, + .n0 = 32, + + .lhs_transposed = false, + .rhs_transposed = false, + + .dst_format = DataFormat(DataType::FP16), + .lhs_format = DataFormat(DataType::FP16), + .packed_lhs_format = DataFormat(DataType::UNKNOWN), + .rhs_format = DataFormat(DataType::FP16), + .packed_rhs_format = DataFormat( + DataType::FP16, 32, 0, DataFormat::PackFormat::BIAS_PER_ROW, DataType::FP16, DataType::UNKNOWN, 32, 1), + .bias_format = DataFormat(DataType::FP16), + + .fn_is_supported = cpu_has_fp16, + .fn_get_mr = nullptr, + .fn_get_nr = kai_get_nr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_get_kr = 
kai_get_kr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_get_sr = kai_get_sr_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + + .fn_get_main_m_step = kai_get_m_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_get_pack_rhs_n_step = kai_get_n_step_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + .fn_get_main_n_step = kai_get_n_step_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + + .fn_get_lhs_offset = kai_get_lhs_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_get_packed_lhs_size = nullptr, + .fn_get_packed_lhs_offset = nullptr, + .fn_pack_lhs = nullptr, + + .fn_get_rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + .fn_get_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + .fn_get_pack_rhs_packed_rhs_offset = kai_get_rhs_packed_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + .fn_get_main_packed_rhs_offset = kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_pack_rhs = kai_run_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + + .fn_get_bias_offset = kai_get_bias_offset_rhs_pack_kxn_f16p32x1biasf16_f16_f16_neon, + + .fn_get_dst_offset = kai_get_dst_offset_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_get_dst_size = kai_get_dst_size_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + + .fn_matmul_f16_f16_f16p = kai_run_matmul_clamp_f16_f16_f16p32x1biasf16_6x32x8_neon_mla, + .fn_matmul_f32_f32_f32p = nullptr, + .fn_matmul_f32_f32p_f32p = nullptr, + }, + MatMulMethod{ .name = "matmul_nt_nt_fp32_fp32_fp32_6x8_neon_mla", -- GitLab