diff --git a/CMakeLists.txt b/CMakeLists.txt index 25a9afb8dd51ee33e60ee60a110f36759f83bb22..73293cd03d6768a8bcc8dcf0e8becbb97b20008e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,7 @@ set(KLEIDIAI_FILES_NEON kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c + kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.c diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index e3a07eef317c576bbcf74a3a9ccd0bfc5a1cd6ef..42a3f4418cd1a1d831279ef6dc41ffa61f6ddc51 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -40,6 +40,10 @@ NEON_KERNELS = [ "pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon", ] +NEON_KERNELS_ASM = [ + "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla", +] + # buildifier: keep sorted FP16_KERNELS = [ "matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla", @@ -145,6 +149,12 @@ kai_c_library( textual_hdrs = [ukernel + ".h" for ukernel in NEON_KERNELS], ) +kai_c_library( + name = "neon_impl_asm", + srcs = [ukernel + "_asm.S" for ukernel in NEON_KERNELS_ASM], + cpu_uarch = kai_cpu_neon(), +) + kai_c_library( name = "fp16_impl", srcs = [ukernel + ".c" for ukernel in FP16_KERNELS], @@ -205,6 +215,7 @@ kai_c_library( ":i8mm_impl", ":interface", ":neon_impl", + ":neon_impl_asm", ":scalar_impl", ":sme2_impl", ":sme_impl", diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c 
b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c index fb380100906f6df59386a857d96b89a441ce7186..603d3f8fd7ea374e0aee16645f0ed38ab65796c6 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c @@ -1,10 +1,10 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // -#if !defined(__aarch64__) +#if (!defined(__aarch64__) && !defined(_M_ARM64)) #error This file must be compiled for AArch64. #else // Architectural features check. @@ -15,6 +15,24 @@ #include "kai/kai_common.h" +typedef struct { + float maxval; + float minval; + unsigned int num_strings; + const unsigned int* string_lengths; + size_t N; + const void* B_ptr; + size_t output_offset; + size_t input_initial_col; + size_t input_offset; + void* output_ptr; + const void* bias; +} kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t; + +extern void kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl( + const void* input_ptr, size_t m, kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t* args_ptr, + unsigned long flags); + static const size_t kai_mr = 6; static const size_t kai_nr = 8; static const size_t kai_kr = 1; @@ -72,21 +90,7 @@ void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla( float clamp_min, float clamp_max) { KAI_ASSERT(dst_stride_col == sizeof(float)); - typedef struct { - float maxval; - float minval; - unsigned int num_strings; - const unsigned int* string_lengths; - size_t N; - const void* B_ptr; - size_t output_offset; - size_t input_initial_col; - size_t input_offset; - void* output_ptr; - const void* bias; - } KernelArgs; - - KernelArgs ka; + 
kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl_args_t ka; unsigned long flags = 0; @@ -111,1940 +115,7 @@ void kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla( ka.maxval = clamp_max; ka.minval = clamp_min; - __asm__ __volatile__( - "1:" // Row loop - "cmp %x[m], #0x6\n" - "bge 126f\n" - "cmp %x[m], #0x4\n" - "bgt 101f\n" - "beq 76f\n" - "cmp %x[m], #0x2\n" - "bgt 51f\n" - "beq 26f\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "2:" // Height 1: Column loop - "cbz x10, 3f\n" - "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "b 10f\n" - "3:" // Height 1: no bias - "tbz %x[flags], #0, 9f\n" - "cmp x11, #0x8\n" - "bge 8f\n" - "tbz x11, #2, 5f\n" - "ld1 { v20.4s }, [x9], #0x10\n" - "tbz x11, #1, 4f\n" - "ldr d21, [x9], #0x8\n" - "mov x20, #0x18\n" - "tbz x11, #0, 7f\n" - "ld1 { v21.s }[2], [x9]\n" - "b 7f\n" - "4:" // Height 1: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 7f\n" - "ldr s21, [x9, #0x0]\n" - "b 7f\n" - "5:" // Height 1: Partial accumulate: partial_2_0 - "tbz x11, #1, 6f\n" - "ldr d20, [x9], #0x8\n" - "mov x20, #0x8\n" - "tbz x11, #0, 7f\n" - "ld1 { v20.s }[2], [x9]\n" - "b 7f\n" - "6:" // Height 1: Partial accumulate: partial_1_0 - "ldr s20, [x9, #0x0]\n" - "mov x20, #0x0\n" - "7:" // Height 1: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 10f\n" - "8:" // Height 1: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "b 10f\n" - "9:" // Height 1: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "10:" // Height 1: setup done - "mov x28, #0x0\n" - "11:" // Height 1: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 12f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - 
"ldr x26, [x20, #0x0]\n" - "cbnz x28, 13f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "b 13f\n" - "12:" // Height 1: setup direct input - "mov x26, %x[input_ptr]\n" - "13:" // Height 1: input setup done - "cmp x27, #0x4\n" - "blt 16f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, [x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 15f\n" - "14:" // Height 1: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "cmp x27, #0x8\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 14b\n" - "15:" // Height 1: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "add x26, x26, #0x10\n" - "sub x27, x27, #0x4\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "16:" // Height 1: Multiply loop: Main loop skip - "cbz x27, 18f\n" - "17:" // Height 1: Multiply loop: Odd block loop - "ldr s18, [x26], #0x4\n" - "ldr q17, [x10, #0x0]\n" - "sub x27, x27, #0x1\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla 
v20.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 17b\n" - "18:" // Height 1: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 11b\n" - "prfm pstl1keep, [x9, #0x0]\n" - "tbz %x[flags], #1, 19f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "19:" // Height 1: No activation - "cmp x11, #0x8\n" - "bge 24f\n" - "tbz x11, #2, 21f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "tbz x11, #1, 20f\n" - "str d21, [x9], #0x8\n" - "tbz x11, #0, 23f\n" - "st1 { v21.s }[2], [x9]\n" - "b 23f\n" - "20:" // Height 1: Partial direct writeback: partial_1_4 - "tbz x11, #0, 23f\n" - "str s21, [x9, #0x0]\n" - "b 23f\n" - "21:" // Height 1: Partial direct writeback: partial_2_0 - "tbz x11, #1, 22f\n" - "str d20, [x9], #0x8\n" - "tbz x11, #0, 23f\n" - "st1 { v20.s }[2], [x9]\n" - "b 23f\n" - "22:" // Height 1: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "23:" // Height 1: Partial direct writeback: Done - "b 25f\n" - "24:" // Height 1: Full writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "25:" // Height 1: Writeback done - "subs x11, x11, #0x8\n" - "bgt 2b\n" - "b 152f\n" - "26:" // Height 2 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "27:" // Height 2: Column loop - "cbz x10, 28f\n" - "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "mov v22.16b, v20.16b\n" - "mov v23.16b, v21.16b\n" - "b 35f\n" - "28:" // Height 2: no bias - "tbz %x[flags], #0, 34f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x8\n" - "add x26, x9, x20, LSL 
#2\n" - "bge 33f\n" - "tbz x11, #2, 30f\n" - "ld1 { v20.4s }, [x9], #0x10\n" - "ld1 { v22.4s }, [x26], #0x10\n" - "tbz x11, #1, 29f\n" - "ldr d21, [x9], #0x8\n" - "ldr d23, [x26], #0x8\n" - "mov x20, #0x18\n" - "tbz x11, #0, 32f\n" - "ld1 { v21.s }[2], [x9]\n" - "ld1 { v23.s }[2], [x26]\n" - "b 32f\n" - "29:" // Height 2: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 32f\n" - "ldr s21, [x9, #0x0]\n" - "ldr s23, [x26, #0x0]\n" - "b 32f\n" - "30:" // Height 2: Partial accumulate: partial_2_0 - "tbz x11, #1, 31f\n" - "ldr d20, [x9], #0x8\n" - "ldr d22, [x26], #0x8\n" - "mov x20, #0x8\n" - "tbz x11, #0, 32f\n" - "ld1 { v20.s }[2], [x9]\n" - "ld1 { v22.s }[2], [x26]\n" - "b 32f\n" - "31:" // Height 2: Partial accumulate: partial_1_0 - "ldr s20, [x9, #0x0]\n" - "ldr s22, [x26, #0x0]\n" - "mov x20, #0x0\n" - "32:" // Height 2: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 35f\n" - "33:" // Height 2: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "ldr q22, [x26, #0x0]\n" - "ldr q23, [x26, #0x10]\n" - "b 35f\n" - "34:" // Height 2: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "35:" // Height 2: setup done - "mov x28, #0x0\n" - "36:" // Height 2: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 37f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "cbnz x28, 38f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "b 38f\n" - "37:" // Height 2: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #2\n" - "38:" // Height 2: input setup done - "cmp x27, #0x4\n" - "blt 41f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, 
#0x8\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, [x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 40f\n" - "39:" // Height 2: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "add x25, x25, #0x10\n" - "cmp x27, #0x8\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "ldr q1, [x25, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 39b\n" - "40:" // Height 2: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x10, x10, #0x80\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" 
- "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "41:" // Height 2: Multiply loop: Main loop skip - "cbz x27, 43f\n" - "42:" // Height 2: Multiply loop: Odd block loop - "ldr s19, [x26], #0x4\n" - "ldr s18, [x25], #0x4\n" - "sub x27, x27, #0x1\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla v20.4s, v17.4s, v19.s[0]\n" - "fmla v22.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v19.s[0]\n" - "fmla v23.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 42b\n" - "43:" // Height 2: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 36b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add x26, x9, x20, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "tbz %x[flags], #1, 44f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmin v22.4s, v22.4s, v17.4s\n" - "fmin v23.4s, v23.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "fmax v22.4s, v22.4s, v16.4s\n" - "fmax v23.4s, v23.4s, v16.4s\n" - "44:" // Height 2: No activation - "cmp x11, #0x8\n" - "bge 49f\n" - "tbz x11, #2, 46f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "st1 { v22.4s }, [x26], #0x10\n" - "tbz x11, #1, 45f\n" - "str d21, [x9], #0x8\n" - "str d23, [x26], #0x8\n" - "tbz x11, #0, 48f\n" - "st1 { v21.s }[2], [x9]\n" - "st1 { v23.s }[2], [x26]\n" - "b 48f\n" - "45:" // Height 2: Partial direct writeback: partial_1_4 - "tbz x11, #0, 48f\n" - "str s21, [x9, #0x0]\n" - "str s23, [x26, #0x0]\n" - "b 48f\n" - "46:" // Height 2: Partial direct writeback: partial_2_0 - "tbz x11, #1, 47f\n" - "str d20, [x9], #0x8\n" - "str d22, [x26], #0x8\n" - "tbz x11, #0, 48f\n" - "st1 { v20.s }[2], [x9]\n" - "st1 { v22.s 
}[2], [x26]\n" - "b 48f\n" - "47:" // Height 2: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "str s22, [x26, #0x0]\n" - "48:" // Height 2: Partial direct writeback: Done - "b 50f\n" - "49:" // Height 2: Full writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "str q22, [x26, #0x0]\n" - "str q23, [x26, #0x10]\n" - "50:" // Height 2: Writeback done - "subs x11, x11, #0x8\n" - "bgt 27b\n" - "b 152f\n" - "51:" // Height 3 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "52:" // Height 3: Column loop - "cbz x10, 53f\n" - "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "mov v22.16b, v20.16b\n" - "mov v23.16b, v21.16b\n" - "mov v24.16b, v20.16b\n" - "mov v25.16b, v21.16b\n" - "b 60f\n" - "53:" // Height 3: no bias - "tbz %x[flags], #0, 59f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x8\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "bge 58f\n" - "tbz x11, #2, 55f\n" - "ld1 { v20.4s }, [x9], #0x10\n" - "ld1 { v22.4s }, [x26], #0x10\n" - "ld1 { v24.4s }, [x25], #0x10\n" - "tbz x11, #1, 54f\n" - "ldr d21, [x9], #0x8\n" - "ldr d23, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d25, [x25], #0x8\n" - "tbz x11, #0, 57f\n" - "ld1 { v21.s }[2], [x9]\n" - "ld1 { v23.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "b 57f\n" - "54:" // Height 3: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 57f\n" - "ldr s21, [x9, #0x0]\n" - "ldr s23, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "b 57f\n" - "55:" // Height 3: Partial accumulate: partial_2_0 - "tbz x11, #1, 56f\n" - "ldr d20, [x9], #0x8\n" - "ldr d22, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d24, [x25], #0x8\n" - "tbz x11, #0, 57f\n" - "ld1 { v20.s }[2], [x9]\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "b 57f\n" - "56:" // Height 3: Partial accumulate: 
partial_1_0 - "ldr s20, [x9, #0x0]\n" - "ldr s22, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s24, [x25, #0x0]\n" - "57:" // Height 3: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 60f\n" - "58:" // Height 3: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "ldr q22, [x26, #0x0]\n" - "ldr q23, [x26, #0x10]\n" - "ldr q24, [x25, #0x0]\n" - "ldr q25, [x25, #0x10]\n" - "b 60f\n" - "59:" // Height 3: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "60:" // Height 3: setup done - "mov x28, #0x0\n" - "61:" // Height 3: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 62f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "cbnz x28, 63f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "b 63f\n" - "62:" // Height 3: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #2\n" - "add x24, x25, x21, LSL #2\n" - "63:" // Height 3: input setup done - "cmp x27, #0x4\n" - "blt 66f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, [x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 65f\n" - "64:" // Height 3: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "add x25, x25, 
#0x10\n" - "add x24, x24, #0x10\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "cmp x27, #0x8\n" - "add x10, x10, #0x80\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q2, [x24, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 64b\n" - "65:" // Height 3: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "add x24, x24, #0x10\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "sub x27, x27, #0x4\n" - "prfm pldl1keep, [x25, #0x80]\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - 
"fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "66:" // Height 3: Multiply loop: Main loop skip - "cbz x27, 68f\n" - "67:" // Height 3: Multiply loop: Odd block loop - "ldr s26, [x26], #0x4\n" - "ldr s19, [x25], #0x4\n" - "sub x27, x27, #0x1\n" - "ldr s18, [x24], #0x4\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla v20.4s, v17.4s, v26.s[0]\n" - "fmla v22.4s, v17.4s, v19.s[0]\n" - "fmla v24.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v26.s[0]\n" - "fmla v23.4s, v16.4s, v19.s[0]\n" - "fmla v25.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 67b\n" - "68:" // Height 3: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 61b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add x26, x9, x20, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x20, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "tbz %x[flags], #1, 69f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmin v22.4s, v22.4s, v17.4s\n" - "fmin v23.4s, v23.4s, v17.4s\n" - "fmin v24.4s, v24.4s, v17.4s\n" - "fmin v25.4s, v25.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "fmax v22.4s, v22.4s, v16.4s\n" - "fmax v23.4s, v23.4s, v16.4s\n" - "fmax v24.4s, v24.4s, v16.4s\n" - "fmax v25.4s, v25.4s, v16.4s\n" - "69:" // Height 3: No activation - "cmp x11, #0x8\n" - "bge 74f\n" - "tbz x11, #2, 71f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "st1 { v22.4s }, [x26], #0x10\n" - "st1 { 
v24.4s }, [x25], #0x10\n" - "tbz x11, #1, 70f\n" - "str d21, [x9], #0x8\n" - "str d23, [x26], #0x8\n" - "str d25, [x25], #0x8\n" - "tbz x11, #0, 73f\n" - "st1 { v21.s }[2], [x9]\n" - "st1 { v23.s }[2], [x26]\n" - "st1 { v25.s }[2], [x25]\n" - "b 73f\n" - "70:" // Height 3: Partial direct writeback: partial_1_4 - "tbz x11, #0, 73f\n" - "str s21, [x9, #0x0]\n" - "str s23, [x26, #0x0]\n" - "str s25, [x25, #0x0]\n" - "b 73f\n" - "71:" // Height 3: Partial direct writeback: partial_2_0 - "tbz x11, #1, 72f\n" - "str d20, [x9], #0x8\n" - "str d22, [x26], #0x8\n" - "str d24, [x25], #0x8\n" - "tbz x11, #0, 73f\n" - "st1 { v20.s }[2], [x9]\n" - "st1 { v22.s }[2], [x26]\n" - "st1 { v24.s }[2], [x25]\n" - "b 73f\n" - "72:" // Height 3: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "str s22, [x26, #0x0]\n" - "str s24, [x25, #0x0]\n" - "73:" // Height 3: Partial direct writeback: Done - "b 75f\n" - "74:" // Height 3: Full writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "str q22, [x26, #0x0]\n" - "str q23, [x26, #0x10]\n" - "str q24, [x25, #0x0]\n" - "str q25, [x25, #0x10]\n" - "75:" // Height 3: Writeback done - "subs x11, x11, #0x8\n" - "bgt 52b\n" - "b 152f\n" - "76:" // Height 4 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "77:" // Height 4: Column loop - "cbz x10, 78f\n" - "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "mov v22.16b, v20.16b\n" - "mov v23.16b, v21.16b\n" - "mov v24.16b, v20.16b\n" - "mov v25.16b, v21.16b\n" - "mov v26.16b, v20.16b\n" - "mov v27.16b, v21.16b\n" - "b 85f\n" - "78:" // Height 4: no bias - "tbz %x[flags], #0, 84f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x8\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "bge 83f\n" - "tbz x11, #2, 80f\n" - "ld1 { v20.4s }, [x9], 
#0x10\n" - "ld1 { v22.4s }, [x26], #0x10\n" - "ld1 { v24.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x24], #0x10\n" - "tbz x11, #1, 79f\n" - "ldr d21, [x9], #0x8\n" - "ldr d23, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d25, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "tbz x11, #0, 82f\n" - "ld1 { v21.s }[2], [x9]\n" - "ld1 { v23.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x24]\n" - "b 82f\n" - "79:" // Height 4: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 82f\n" - "ldr s21, [x9, #0x0]\n" - "ldr s23, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "ldr s27, [x24, #0x0]\n" - "b 82f\n" - "80:" // Height 4: Partial accumulate: partial_2_0 - "tbz x11, #1, 81f\n" - "ldr d20, [x9], #0x8\n" - "ldr d22, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d26, [x24], #0x8\n" - "tbz x11, #0, 82f\n" - "ld1 { v20.s }[2], [x9]\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x24]\n" - "b 82f\n" - "81:" // Height 4: Partial accumulate: partial_1_0 - "ldr s20, [x9, #0x0]\n" - "ldr s22, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s24, [x25, #0x0]\n" - "ldr s26, [x24, #0x0]\n" - "82:" // Height 4: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 85f\n" - "83:" // Height 4: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "ldr q22, [x26, #0x0]\n" - "ldr q23, [x26, #0x10]\n" - "ldr q24, [x25, #0x0]\n" - "ldr q25, [x25, #0x10]\n" - "ldr q26, [x24, #0x0]\n" - "ldr q27, [x24, #0x10]\n" - "b 85f\n" - "84:" // Height 4: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "85:" // Height 4: setup done - "mov x28, #0x0\n" - "86:" // Height 4: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz 
%x[flags], #3, 87f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "cbnz x28, 88f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "b 88f\n" - "87:" // Height 4: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #2\n" - "add x24, x25, x21, LSL #2\n" - "add x23, x24, x21, LSL #2\n" - "88:" // Height 4: input setup done - "cmp x27, #0x4\n" - "blt 91f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, [x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 90f\n" - "89:" // Height 4: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "add x23, x23, #0x10\n" - "cmp x27, #0x8\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, 
v9.4s, v3.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, v11.4s, v3.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q2, [x24, #0x0]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 89b\n" - "90:" // Height 4: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "add x10, x10, #0x80\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, 
v11.4s, v3.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "91:" // Height 4: Multiply loop: Main loop skip - "cbz x27, 93f\n" - "92:" // Height 4: Multiply loop: Odd block loop - "ldr s29, [x26], #0x4\n" - "ldr s28, [x25], #0x4\n" - "sub x27, x27, #0x1\n" - "ldr s19, [x24], #0x4\n" - "ldr s18, [x23], #0x4\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla v20.4s, v17.4s, v29.s[0]\n" - "fmla v22.4s, v17.4s, v28.s[0]\n" - "fmla v24.4s, v17.4s, v19.s[0]\n" - "fmla v26.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v29.s[0]\n" - "fmla v23.4s, v16.4s, v28.s[0]\n" - "fmla v25.4s, v16.4s, v19.s[0]\n" - "fmla v27.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 92b\n" - "93:" // Height 4: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 86b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add x26, x9, x20, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x20, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x20, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "tbz %x[flags], #1, 94f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmin v22.4s, v22.4s, v17.4s\n" - "fmin v23.4s, v23.4s, v17.4s\n" - "fmin v24.4s, v24.4s, v17.4s\n" - "fmin v25.4s, v25.4s, v17.4s\n" - "fmin v26.4s, v26.4s, v17.4s\n" - "fmin v27.4s, v27.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "fmax v22.4s, v22.4s, v16.4s\n" - "fmax v23.4s, v23.4s, v16.4s\n" - "fmax v24.4s, 
v24.4s, v16.4s\n" - "fmax v25.4s, v25.4s, v16.4s\n" - "fmax v26.4s, v26.4s, v16.4s\n" - "fmax v27.4s, v27.4s, v16.4s\n" - "94:" // Height 4: No activation - "cmp x11, #0x8\n" - "bge 99f\n" - "tbz x11, #2, 96f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "st1 { v22.4s }, [x26], #0x10\n" - "st1 { v24.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x24], #0x10\n" - "tbz x11, #1, 95f\n" - "str d21, [x9], #0x8\n" - "str d23, [x26], #0x8\n" - "str d25, [x25], #0x8\n" - "str d27, [x24], #0x8\n" - "tbz x11, #0, 98f\n" - "st1 { v21.s }[2], [x9]\n" - "st1 { v23.s }[2], [x26]\n" - "st1 { v25.s }[2], [x25]\n" - "st1 { v27.s }[2], [x24]\n" - "b 98f\n" - "95:" // Height 4: Partial direct writeback: partial_1_4 - "tbz x11, #0, 98f\n" - "str s21, [x9, #0x0]\n" - "str s23, [x26, #0x0]\n" - "str s25, [x25, #0x0]\n" - "str s27, [x24, #0x0]\n" - "b 98f\n" - "96:" // Height 4: Partial direct writeback: partial_2_0 - "tbz x11, #1, 97f\n" - "str d20, [x9], #0x8\n" - "str d22, [x26], #0x8\n" - "str d24, [x25], #0x8\n" - "str d26, [x24], #0x8\n" - "tbz x11, #0, 98f\n" - "st1 { v20.s }[2], [x9]\n" - "st1 { v22.s }[2], [x26]\n" - "st1 { v24.s }[2], [x25]\n" - "st1 { v26.s }[2], [x24]\n" - "b 98f\n" - "97:" // Height 4: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "str s22, [x26, #0x0]\n" - "str s24, [x25, #0x0]\n" - "str s26, [x24, #0x0]\n" - "98:" // Height 4: Partial direct writeback: Done - "b 100f\n" - "99:" // Height 4: Full writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "str q22, [x26, #0x0]\n" - "str q23, [x26, #0x10]\n" - "str q24, [x25, #0x0]\n" - "str q25, [x25, #0x10]\n" - "str q26, [x24, #0x0]\n" - "str q27, [x24, #0x10]\n" - "100:" // Height 4: Writeback done - "subs x11, x11, #0x8\n" - "bgt 77b\n" - "b 152f\n" - "101:" // Height 5 - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "102:" // Height 5: Column loop - "cbz x10, 103f\n" 
- "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "mov v22.16b, v20.16b\n" - "mov v23.16b, v21.16b\n" - "mov v24.16b, v20.16b\n" - "mov v25.16b, v21.16b\n" - "mov v26.16b, v20.16b\n" - "mov v27.16b, v21.16b\n" - "mov v28.16b, v20.16b\n" - "mov v29.16b, v21.16b\n" - "b 110f\n" - "103:" // Height 5: no bias - "tbz %x[flags], #0, 109f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x8\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "bge 108f\n" - "tbz x11, #2, 105f\n" - "ld1 { v20.4s }, [x9], #0x10\n" - "ld1 { v22.4s }, [x26], #0x10\n" - "ld1 { v24.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x24], #0x10\n" - "ld1 { v28.4s }, [x23], #0x10\n" - "tbz x11, #1, 104f\n" - "ldr d21, [x9], #0x8\n" - "ldr d23, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d25, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d29, [x23], #0x8\n" - "tbz x11, #0, 107f\n" - "ld1 { v21.s }[2], [x9]\n" - "ld1 { v23.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x24]\n" - "ld1 { v29.s }[2], [x23]\n" - "b 107f\n" - "104:" // Height 5: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 107f\n" - "ldr s21, [x9, #0x0]\n" - "ldr s23, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "ldr s27, [x24, #0x0]\n" - "ldr s29, [x23, #0x0]\n" - "b 107f\n" - "105:" // Height 5: Partial accumulate: partial_2_0 - "tbz x11, #1, 106f\n" - "ldr d20, [x9], #0x8\n" - "ldr d22, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d26, [x24], #0x8\n" - "ldr d28, [x23], #0x8\n" - "tbz x11, #0, 107f\n" - "ld1 { v20.s }[2], [x9]\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x24]\n" - "ld1 { v28.s }[2], [x23]\n" - "b 107f\n" - "106:" // Height 5: Partial accumulate: partial_1_0 - "ldr s20, [x9, #0x0]\n" - "ldr s22, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s24, [x25, #0x0]\n" - "ldr s26, [x24, #0x0]\n" - 
"ldr s28, [x23, #0x0]\n" - "107:" // Height 5: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 110f\n" - "108:" // Height 5: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "ldr q22, [x26, #0x0]\n" - "ldr q23, [x26, #0x10]\n" - "ldr q24, [x25, #0x0]\n" - "ldr q25, [x25, #0x10]\n" - "ldr q26, [x24, #0x0]\n" - "ldr q27, [x24, #0x10]\n" - "ldr q28, [x23, #0x0]\n" - "ldr q29, [x23, #0x10]\n" - "b 110f\n" - "109:" // Height 5: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "110:" // Height 5: setup done - "mov x28, #0x0\n" - "111:" // Height 5: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 112f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - "ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "cbnz x28, 113f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x22, x22, x20, LSL #2\n" - "b 113f\n" - "112:" // Height 5: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #2\n" - "add x24, x25, x21, LSL #2\n" - "add x23, x24, x21, LSL #2\n" - "add x22, x23, x21, LSL #2\n" - "113:" // Height 5: input setup done - "cmp x27, #0x4\n" - "blt 116f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, 
[x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 115f\n" - "114:" // Height 5: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v28.4s, v6.4s, v4.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "cmp x27, #0x8\n" - "add x10, x10, #0x80\n" - "ldr q6, [x10, #0x0]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "fmla v29.4s, v7.4s, v4.s[0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v28.4s, v8.4s, v4.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" - "fmla v29.4s, v9.4s, v4.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "fmla v28.4s, v10.4s, v4.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, v11.4s, v3.s[2]\n" - "fmla v29.4s, v11.4s, v4.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "fmla v28.4s, v12.4s, v4.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" 
- "fmla v23.4s, v13.4s, v1.s[3]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q2, [x24, #0x0]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "ldr q3, [x23, #0x0]\n" - "fmla v29.4s, v13.4s, v4.s[3]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 114b\n" - "115:" // Height 5: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x24, x24, #0x10\n" - "add x23, x23, #0x10\n" - "fmla v28.4s, v6.4s, v4.s[0]\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "add x22, x22, #0x10\n" - "sub x27, x27, #0x4\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "prfm pldl1keep, [x26, #0x80]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "fmla v29.4s, v7.4s, v4.s[0]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "add x10, x10, #0x80\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "fmla v28.4s, v8.4s, v4.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" - "fmla v29.4s, v9.4s, v4.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "fmla v28.4s, v10.4s, v4.s[2]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, v11.4s, v3.s[2]\n" - "fmla v29.4s, v11.4s, v4.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "fmla v28.4s, v12.4s, v4.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, 
v13.4s, v2.s[3]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "fmla v29.4s, v13.4s, v4.s[3]\n" - "116:" // Height 5: Multiply loop: Main loop skip - "cbz x27, 118f\n" - "117:" // Height 5: Multiply loop: Odd block loop - "ldr s0, [x26], #0x4\n" - "ldr s31, [x25], #0x4\n" - "sub x27, x27, #0x1\n" - "ldr s30, [x24], #0x4\n" - "ldr s19, [x23], #0x4\n" - "ldr s18, [x22], #0x4\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla v20.4s, v17.4s, v0.s[0]\n" - "fmla v22.4s, v17.4s, v31.s[0]\n" - "fmla v24.4s, v17.4s, v30.s[0]\n" - "fmla v26.4s, v17.4s, v19.s[0]\n" - "fmla v28.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v0.s[0]\n" - "fmla v23.4s, v16.4s, v31.s[0]\n" - "fmla v25.4s, v16.4s, v30.s[0]\n" - "fmla v27.4s, v16.4s, v19.s[0]\n" - "fmla v29.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 117b\n" - "118:" // Height 5: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 111b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add x26, x9, x20, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x20, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x20, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20, LSL #2\n" - "prfm pstl1keep, [x23, #0x0]\n" - "tbz %x[flags], #1, 119f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmin v22.4s, v22.4s, v17.4s\n" - "fmin v23.4s, v23.4s, v17.4s\n" - "fmin v24.4s, v24.4s, v17.4s\n" - "fmin v25.4s, v25.4s, v17.4s\n" - "fmin v26.4s, v26.4s, v17.4s\n" - "fmin v27.4s, v27.4s, v17.4s\n" - "fmin v28.4s, v28.4s, v17.4s\n" - "fmin v29.4s, v29.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "fmax v22.4s, v22.4s, v16.4s\n" - "fmax v23.4s, v23.4s, 
v16.4s\n" - "fmax v24.4s, v24.4s, v16.4s\n" - "fmax v25.4s, v25.4s, v16.4s\n" - "fmax v26.4s, v26.4s, v16.4s\n" - "fmax v27.4s, v27.4s, v16.4s\n" - "fmax v28.4s, v28.4s, v16.4s\n" - "fmax v29.4s, v29.4s, v16.4s\n" - "119:" // Height 5: No activation - "cmp x11, #0x8\n" - "bge 124f\n" - "tbz x11, #2, 121f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "st1 { v22.4s }, [x26], #0x10\n" - "st1 { v24.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x24], #0x10\n" - "st1 { v28.4s }, [x23], #0x10\n" - "tbz x11, #1, 120f\n" - "str d21, [x9], #0x8\n" - "str d23, [x26], #0x8\n" - "str d25, [x25], #0x8\n" - "str d27, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "tbz x11, #0, 123f\n" - "st1 { v21.s }[2], [x9]\n" - "st1 { v23.s }[2], [x26]\n" - "st1 { v25.s }[2], [x25]\n" - "st1 { v27.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" - "b 123f\n" - "120:" // Height 5: Partial direct writeback: partial_1_4 - "tbz x11, #0, 123f\n" - "str s21, [x9, #0x0]\n" - "str s23, [x26, #0x0]\n" - "str s25, [x25, #0x0]\n" - "str s27, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" - "b 123f\n" - "121:" // Height 5: Partial direct writeback: partial_2_0 - "tbz x11, #1, 122f\n" - "str d20, [x9], #0x8\n" - "str d22, [x26], #0x8\n" - "str d24, [x25], #0x8\n" - "str d26, [x24], #0x8\n" - "str d28, [x23], #0x8\n" - "tbz x11, #0, 123f\n" - "st1 { v20.s }[2], [x9]\n" - "st1 { v22.s }[2], [x26]\n" - "st1 { v24.s }[2], [x25]\n" - "st1 { v26.s }[2], [x24]\n" - "st1 { v28.s }[2], [x23]\n" - "b 123f\n" - "122:" // Height 5: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "str s22, [x26, #0x0]\n" - "str s24, [x25, #0x0]\n" - "str s26, [x24, #0x0]\n" - "str s28, [x23, #0x0]\n" - "123:" // Height 5: Partial direct writeback: Done - "b 125f\n" - "124:" // Height 5: Full writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "str q22, [x26, #0x0]\n" - "str q23, [x26, #0x10]\n" - "str q24, [x25, #0x0]\n" - "str q25, [x25, #0x10]\n" - "str q26, [x24, #0x0]\n" - "str q27, [x24, 
#0x10]\n" - "str q28, [x23, #0x0]\n" - "str q29, [x23, #0x10]\n" - "125:" // Height 5: Writeback done - "subs x11, x11, #0x8\n" - "bgt 102b\n" - "b 152f\n" - "126:" // Height 6 - "ldr x21, [%x[args_ptr], %[offsetof_output_offset]]\n" - "ldr x9, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "mov x20, #0x18\n" - "ldr x11, [%x[args_ptr], %[offsetof_N]]\n" - "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n" - "madd x20, x21, x20, x9\n" - "str x20, [%x[args_ptr], %[offsetof_output_ptr]]\n" - "127:" // Height 6: Column loop - "cbz x10, 128f\n" - "ldr q20, [x10, #0x0]\n" - "ldr q21, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "mov v22.16b, v20.16b\n" - "mov v23.16b, v21.16b\n" - "mov v24.16b, v20.16b\n" - "mov v25.16b, v21.16b\n" - "mov v26.16b, v20.16b\n" - "mov v27.16b, v21.16b\n" - "mov v28.16b, v20.16b\n" - "mov v29.16b, v21.16b\n" - "mov v30.16b, v20.16b\n" - "mov v31.16b, v21.16b\n" - "b 135f\n" - "128:" // Height 6: no bias - "tbz %x[flags], #0, 134f\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "cmp x11, #0x8\n" - "add x26, x9, x20, LSL #2\n" - "add x25, x26, x20, LSL #2\n" - "add x24, x25, x20, LSL #2\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "bge 133f\n" - "tbz x11, #2, 130f\n" - "ld1 { v20.4s }, [x9], #0x10\n" - "ld1 { v22.4s }, [x26], #0x10\n" - "ld1 { v24.4s }, [x25], #0x10\n" - "ld1 { v26.4s }, [x24], #0x10\n" - "ld1 { v28.4s }, [x23], #0x10\n" - "ld1 { v30.4s }, [x22], #0x10\n" - "tbz x11, #1, 129f\n" - "ldr d21, [x9], #0x8\n" - "ldr d23, [x26], #0x8\n" - "mov x20, #0x18\n" - "ldr d25, [x25], #0x8\n" - "ldr d27, [x24], #0x8\n" - "ldr d29, [x23], #0x8\n" - "ldr d31, [x22], #0x8\n" - "tbz x11, #0, 132f\n" - "ld1 { v21.s }[2], [x9]\n" - "ld1 { v23.s }[2], [x26]\n" - "ld1 { v25.s }[2], [x25]\n" - "ld1 { v27.s }[2], [x24]\n" - "ld1 { v29.s }[2], [x23]\n" - "ld1 { v31.s }[2], [x22]\n" - "b 132f\n" - "129:" // Height 6: Partial accumulate: partial_1_4 - "mov x20, #0x10\n" - "tbz x11, #0, 132f\n" - "ldr s21, [x9, #0x0]\n" - 
"ldr s23, [x26, #0x0]\n" - "ldr s25, [x25, #0x0]\n" - "ldr s27, [x24, #0x0]\n" - "ldr s29, [x23, #0x0]\n" - "ldr s31, [x22, #0x0]\n" - "b 132f\n" - "130:" // Height 6: Partial accumulate: partial_2_0 - "tbz x11, #1, 131f\n" - "ldr d20, [x9], #0x8\n" - "ldr d22, [x26], #0x8\n" - "mov x20, #0x8\n" - "ldr d24, [x25], #0x8\n" - "ldr d26, [x24], #0x8\n" - "ldr d28, [x23], #0x8\n" - "ldr d30, [x22], #0x8\n" - "tbz x11, #0, 132f\n" - "ld1 { v20.s }[2], [x9]\n" - "ld1 { v22.s }[2], [x26]\n" - "ld1 { v24.s }[2], [x25]\n" - "ld1 { v26.s }[2], [x24]\n" - "ld1 { v28.s }[2], [x23]\n" - "ld1 { v30.s }[2], [x22]\n" - "b 132f\n" - "131:" // Height 6: Partial accumulate: partial_1_0 - "ldr s20, [x9, #0x0]\n" - "ldr s22, [x26, #0x0]\n" - "mov x20, #0x0\n" - "ldr s24, [x25, #0x0]\n" - "ldr s26, [x24, #0x0]\n" - "ldr s28, [x23, #0x0]\n" - "ldr s30, [x22, #0x0]\n" - "132:" // Height 6: Partial accumulate: Done - "sub x9, x9, x20\n" - "b 135f\n" - "133:" // Height 6: full accumulate - "ldr q20, [x9, #0x0]\n" - "ldr q21, [x9, #0x10]\n" - "ldr q22, [x26, #0x0]\n" - "ldr q23, [x26, #0x10]\n" - "ldr q24, [x25, #0x0]\n" - "ldr q25, [x25, #0x10]\n" - "ldr q26, [x24, #0x0]\n" - "ldr q27, [x24, #0x10]\n" - "ldr q28, [x23, #0x0]\n" - "ldr q29, [x23, #0x10]\n" - "ldr q30, [x22, #0x0]\n" - "ldr q31, [x22, #0x10]\n" - "b 135f\n" - "134:" // Height 6: no accumulate - "movi v20.16b, #0x0\n" - "movi v21.16b, #0x0\n" - "movi v22.16b, #0x0\n" - "movi v23.16b, #0x0\n" - "movi v24.16b, #0x0\n" - "movi v25.16b, #0x0\n" - "movi v26.16b, #0x0\n" - "movi v27.16b, #0x0\n" - "movi v28.16b, #0x0\n" - "movi v29.16b, #0x0\n" - "movi v30.16b, #0x0\n" - "movi v31.16b, #0x0\n" - "135:" // Height 6: setup done - "mov x28, #0x0\n" - "136:" // Height 6: String loop - "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "ldr w27, [x20, x28, LSL #0x2]\n" - "tbz %x[flags], #3, 137f\n" - "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n" - "add x20, x20, x21, LSL #3\n" - 
"ldr x26, [x20, #0x0]\n" - "ldr x25, [x20, #0x8]\n" - "ldr x24, [x20, #0x10]\n" - "ldr x23, [x20, #0x18]\n" - "ldr x22, [x20, #0x20]\n" - "ldr x21, [x20, #0x28]\n" - "cbnz x28, 138f\n" - "ldr x20, [%x[args_ptr], %[offsetof_input_initial_col]]\n" - "add x26, x26, x20, LSL #2\n" - "add x25, x25, x20, LSL #2\n" - "add x24, x24, x20, LSL #2\n" - "add x23, x23, x20, LSL #2\n" - "add x22, x22, x20, LSL #2\n" - "add x21, x21, x20, LSL #2\n" - "b 138f\n" - "137:" // Height 6: setup direct input - "mov x26, %x[input_ptr]\n" - "add x25, x26, x21, LSL #2\n" - "add x24, x25, x21, LSL #2\n" - "add x23, x24, x21, LSL #2\n" - "add x22, x23, x21, LSL #2\n" - "add x21, x22, x21, LSL #2\n" - "138:" // Height 6: input setup done - "cmp x27, #0x4\n" - "blt 141f\n" - "ldr q0, [x26, #0x0]\n" - "ldr q1, [x25, #0x0]\n" - "cmp x27, #0x8\n" - "ldr q2, [x24, #0x0]\n" - "ldr q3, [x23, #0x0]\n" - "ldr q4, [x22, #0x0]\n" - "ldr q5, [x21, #0x0]\n" - "ldr q6, [x10, #0x0]\n" - "ldr q7, [x10, #0x10]\n" - "ldr q8, [x10, #0x20]\n" - "ldr q9, [x10, #0x30]\n" - "ldr q10, [x10, #0x40]\n" - "ldr q11, [x10, #0x50]\n" - "ldr q12, [x10, #0x60]\n" - "ldr q13, [x10, #0x70]\n" - "blt 140f\n" - "139:" // Height 6: Multiply loop: Main loop head - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "add x26, x26, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x25, x25, #0x10\n" - "add x24, x24, #0x10\n" - "fmla v28.4s, v6.4s, v4.s[0]\n" - "fmla v30.4s, v6.4s, v5.s[0]\n" - "add x23, x23, #0x10\n" - "add x22, x22, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "add x21, x21, #0x10\n" - "cmp x27, #0x8\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "add x10, x10, #0x80\n" - "prfm pldl1keep, [x26, #0x80]\n" - "ldr q6, [x10, #0x0]\n" - "fmla v29.4s, v7.4s, v4.s[0]\n" - "fmla v31.4s, v7.4s, v5.s[0]\n" - "ldr q7, [x10, #0x10]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, 
v1.s[1]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v28.4s, v8.4s, v4.s[1]\n" - "fmla v30.4s, v8.4s, v5.s[1]\n" - "ldr q8, [x10, #0x20]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" - "fmla v29.4s, v9.4s, v4.s[1]\n" - "fmla v31.4s, v9.4s, v5.s[1]\n" - "ldr q9, [x10, #0x30]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "fmla v28.4s, v10.4s, v4.s[2]\n" - "fmla v30.4s, v10.4s, v5.s[2]\n" - "ldr q10, [x10, #0x40]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, v11.4s, v3.s[2]\n" - "fmla v29.4s, v11.4s, v4.s[2]\n" - "fmla v31.4s, v11.4s, v5.s[2]\n" - "ldr q11, [x10, #0x50]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "fmla v28.4s, v12.4s, v4.s[3]\n" - "fmla v30.4s, v12.4s, v5.s[3]\n" - "ldr q12, [x10, #0x60]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "ldr q0, [x26, #0x0]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "ldr q1, [x25, #0x0]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "ldr q2, [x24, #0x0]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "ldr q3, [x23, #0x0]\n" - "fmla v29.4s, v13.4s, v4.s[3]\n" - "ldr q4, [x22, #0x0]\n" - "fmla v31.4s, v13.4s, v5.s[3]\n" - "ldr q5, [x21, #0x0]\n" - "ldr q13, [x10, #0x70]\n" - "bge 139b\n" - "140:" // Height 6: Multiply loop: Single iteration only - "fmla v20.4s, v6.4s, v0.s[0]\n" - "fmla v22.4s, v6.4s, v1.s[0]\n" - "add x26, x26, #0x10\n" - "add x25, x25, #0x10\n" - "fmla v24.4s, v6.4s, v2.s[0]\n" - "fmla v26.4s, v6.4s, v3.s[0]\n" - "add x24, x24, #0x10\n" - "add x23, x23, 
#0x10\n" - "fmla v28.4s, v6.4s, v4.s[0]\n" - "fmla v30.4s, v6.4s, v5.s[0]\n" - "add x22, x22, #0x10\n" - "add x21, x21, #0x10\n" - "fmla v21.4s, v7.4s, v0.s[0]\n" - "fmla v23.4s, v7.4s, v1.s[0]\n" - "sub x27, x27, #0x4\n" - "prfm pldl1keep, [x26, #0x80]\n" - "fmla v25.4s, v7.4s, v2.s[0]\n" - "fmla v27.4s, v7.4s, v3.s[0]\n" - "prfm pldl1keep, [x25, #0x80]\n" - "prfm pldl1keep, [x24, #0x80]\n" - "fmla v29.4s, v7.4s, v4.s[0]\n" - "fmla v31.4s, v7.4s, v5.s[0]\n" - "prfm pldl1keep, [x23, #0x80]\n" - "prfm pldl1keep, [x22, #0x80]\n" - "fmla v20.4s, v8.4s, v0.s[1]\n" - "fmla v22.4s, v8.4s, v1.s[1]\n" - "prfm pldl1keep, [x21, #0x80]\n" - "add x10, x10, #0x80\n" - "fmla v24.4s, v8.4s, v2.s[1]\n" - "fmla v26.4s, v8.4s, v3.s[1]\n" - "fmla v28.4s, v8.4s, v4.s[1]\n" - "fmla v30.4s, v8.4s, v5.s[1]\n" - "fmla v21.4s, v9.4s, v0.s[1]\n" - "fmla v23.4s, v9.4s, v1.s[1]\n" - "fmla v25.4s, v9.4s, v2.s[1]\n" - "fmla v27.4s, v9.4s, v3.s[1]\n" - "fmla v29.4s, v9.4s, v4.s[1]\n" - "fmla v31.4s, v9.4s, v5.s[1]\n" - "fmla v20.4s, v10.4s, v0.s[2]\n" - "fmla v22.4s, v10.4s, v1.s[2]\n" - "fmla v24.4s, v10.4s, v2.s[2]\n" - "fmla v26.4s, v10.4s, v3.s[2]\n" - "fmla v28.4s, v10.4s, v4.s[2]\n" - "fmla v30.4s, v10.4s, v5.s[2]\n" - "fmla v21.4s, v11.4s, v0.s[2]\n" - "fmla v23.4s, v11.4s, v1.s[2]\n" - "fmla v25.4s, v11.4s, v2.s[2]\n" - "fmla v27.4s, v11.4s, v3.s[2]\n" - "fmla v29.4s, v11.4s, v4.s[2]\n" - "fmla v31.4s, v11.4s, v5.s[2]\n" - "fmla v20.4s, v12.4s, v0.s[3]\n" - "fmla v22.4s, v12.4s, v1.s[3]\n" - "fmla v24.4s, v12.4s, v2.s[3]\n" - "fmla v26.4s, v12.4s, v3.s[3]\n" - "fmla v28.4s, v12.4s, v4.s[3]\n" - "fmla v30.4s, v12.4s, v5.s[3]\n" - "fmla v21.4s, v13.4s, v0.s[3]\n" - "fmla v23.4s, v13.4s, v1.s[3]\n" - "fmla v25.4s, v13.4s, v2.s[3]\n" - "fmla v27.4s, v13.4s, v3.s[3]\n" - "fmla v29.4s, v13.4s, v4.s[3]\n" - "fmla v31.4s, v13.4s, v5.s[3]\n" - "141:" // Height 6: Multiply loop: Main loop skip - "cbz x27, 143f\n" - "142:" // Height 6: Multiply loop: Odd block loop - "ldr s3, [x26], #0x4\n" - "ldr 
s2, [x25], #0x4\n" - "sub x27, x27, #0x1\n" - "ldr s1, [x24], #0x4\n" - "ldr s0, [x23], #0x4\n" - "ldr s19, [x22], #0x4\n" - "ldr s18, [x21], #0x4\n" - "ldr q17, [x10, #0x0]\n" - "ldr q16, [x10, #0x10]\n" - "add x10, x10, #0x20\n" - "fmla v20.4s, v17.4s, v3.s[0]\n" - "fmla v22.4s, v17.4s, v2.s[0]\n" - "fmla v24.4s, v17.4s, v1.s[0]\n" - "fmla v26.4s, v17.4s, v0.s[0]\n" - "fmla v28.4s, v17.4s, v19.s[0]\n" - "fmla v30.4s, v17.4s, v18.s[0]\n" - "fmla v21.4s, v16.4s, v3.s[0]\n" - "fmla v23.4s, v16.4s, v2.s[0]\n" - "fmla v25.4s, v16.4s, v1.s[0]\n" - "fmla v27.4s, v16.4s, v0.s[0]\n" - "fmla v29.4s, v16.4s, v19.s[0]\n" - "fmla v31.4s, v16.4s, v18.s[0]\n" - "cbnz x27, 142b\n" - "143:" // Height 6: Multiply loop: No odd multiplies - "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n" - "add x28, x28, #0x1\n" - "cmp x28, x20\n" - "bne 136b\n" - "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n" - "prfm pstl1keep, [x9, #0x0]\n" - "add x26, x9, x20, LSL #2\n" - "prfm pstl1keep, [x26, #0x0]\n" - "add x25, x26, x20, LSL #2\n" - "prfm pstl1keep, [x25, #0x0]\n" - "add x24, x25, x20, LSL #2\n" - "prfm pstl1keep, [x24, #0x0]\n" - "add x23, x24, x20, LSL #2\n" - "add x22, x23, x20, LSL #2\n" - "prfm pstl1keep, [x23, #0x0]\n" - "prfm pstl1keep, [x22, #0x0]\n" - "tbz %x[flags], #1, 144f\n" - "add x21, %x[args_ptr], %[offset_max]\n" - "add x20, %x[args_ptr], %[offset_min]\n" - "ld1r { v17.4s }, [x21]\n" - "ld1r { v16.4s }, [x20]\n" - "fmin v20.4s, v20.4s, v17.4s\n" - "fmin v21.4s, v21.4s, v17.4s\n" - "fmin v22.4s, v22.4s, v17.4s\n" - "fmin v23.4s, v23.4s, v17.4s\n" - "fmin v24.4s, v24.4s, v17.4s\n" - "fmin v25.4s, v25.4s, v17.4s\n" - "fmin v26.4s, v26.4s, v17.4s\n" - "fmin v27.4s, v27.4s, v17.4s\n" - "fmin v28.4s, v28.4s, v17.4s\n" - "fmin v29.4s, v29.4s, v17.4s\n" - "fmin v30.4s, v30.4s, v17.4s\n" - "fmin v31.4s, v31.4s, v17.4s\n" - "fmax v20.4s, v20.4s, v16.4s\n" - "fmax v21.4s, v21.4s, v16.4s\n" - "fmax v22.4s, v22.4s, v16.4s\n" - "fmax v23.4s, v23.4s, v16.4s\n" - "fmax v24.4s, 
v24.4s, v16.4s\n" - "fmax v25.4s, v25.4s, v16.4s\n" - "fmax v26.4s, v26.4s, v16.4s\n" - "fmax v27.4s, v27.4s, v16.4s\n" - "fmax v28.4s, v28.4s, v16.4s\n" - "fmax v29.4s, v29.4s, v16.4s\n" - "fmax v30.4s, v30.4s, v16.4s\n" - "fmax v31.4s, v31.4s, v16.4s\n" - "144:" // Height 6: No activation - "cmp x11, #0x8\n" - "bge 149f\n" - "tbz x11, #2, 146f\n" - "st1 { v20.4s }, [x9], #0x10\n" - "st1 { v22.4s }, [x26], #0x10\n" - "st1 { v24.4s }, [x25], #0x10\n" - "st1 { v26.4s }, [x24], #0x10\n" - "st1 { v28.4s }, [x23], #0x10\n" - "st1 { v30.4s }, [x22], #0x10\n" - "tbz x11, #1, 145f\n" - "str d21, [x9], #0x8\n" - "str d23, [x26], #0x8\n" - "str d25, [x25], #0x8\n" - "str d27, [x24], #0x8\n" - "str d29, [x23], #0x8\n" - "str d31, [x22], #0x8\n" - "tbz x11, #0, 148f\n" - "st1 { v21.s }[2], [x9]\n" - "st1 { v23.s }[2], [x26]\n" - "st1 { v25.s }[2], [x25]\n" - "st1 { v27.s }[2], [x24]\n" - "st1 { v29.s }[2], [x23]\n" - "st1 { v31.s }[2], [x22]\n" - "b 148f\n" - "145:" // Height 6: Partial direct writeback: partial_1_4 - "tbz x11, #0, 148f\n" - "str s21, [x9, #0x0]\n" - "str s23, [x26, #0x0]\n" - "str s25, [x25, #0x0]\n" - "str s27, [x24, #0x0]\n" - "str s29, [x23, #0x0]\n" - "str s31, [x22, #0x0]\n" - "b 148f\n" - "146:" // Height 6: Partial direct writeback: partial_2_0 - "tbz x11, #1, 147f\n" - "str d20, [x9], #0x8\n" - "str d22, [x26], #0x8\n" - "str d24, [x25], #0x8\n" - "str d26, [x24], #0x8\n" - "str d28, [x23], #0x8\n" - "str d30, [x22], #0x8\n" - "tbz x11, #0, 148f\n" - "st1 { v20.s }[2], [x9]\n" - "st1 { v22.s }[2], [x26]\n" - "st1 { v24.s }[2], [x25]\n" - "st1 { v26.s }[2], [x24]\n" - "st1 { v28.s }[2], [x23]\n" - "st1 { v30.s }[2], [x22]\n" - "b 148f\n" - "147:" // Height 6: Partial direct writeback: partial_1_0 - "str s20, [x9, #0x0]\n" - "str s22, [x26, #0x0]\n" - "str s24, [x25, #0x0]\n" - "str s26, [x24, #0x0]\n" - "str s28, [x23, #0x0]\n" - "str s30, [x22, #0x0]\n" - "148:" // Height 6: Partial direct writeback: Done - "b 150f\n" - "149:" // Height 6: Full 
writeback - "str q20, [x9, #0x0]\n" - "str q21, [x9, #0x10]\n" - "add x9, x9, #0x20\n" - "str q22, [x26, #0x0]\n" - "str q23, [x26, #0x10]\n" - "str q24, [x25, #0x0]\n" - "str q25, [x25, #0x10]\n" - "str q26, [x24, #0x0]\n" - "str q27, [x24, #0x10]\n" - "str q28, [x23, #0x0]\n" - "str q29, [x23, #0x10]\n" - "str q30, [x22, #0x0]\n" - "str q31, [x22, #0x10]\n" - "150:" // Height 6: Writeback done - "subs x11, x11, #0x8\n" - "bgt 127b\n" - "subs %x[m], %x[m], #0x6\n" - "beq 152f\n" - "ldr x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "tbz %x[flags], #3, 151f\n" - "add x21, x21, #0x6\n" - "str x21, [%x[args_ptr], %[offsetof_input_offset]]\n" - "b 1b\n" - "151:" // Update direct input - "mov x20, #0x18\n" - "madd %x[input_ptr], x20, x21, %x[input_ptr]\n" - "b 1b\n" - "152:" // Exit - : [input_ptr] "+&r"(input_ptr), [m] "+&r"(m) - : [args_ptr] "r"(&ka), [flags] "r"(flags), [offset_max] "I"(offsetof(KernelArgs, maxval)), - [offset_min] "I"(offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I"(offsetof(KernelArgs, B_ptr)), - [offsetof_N] "I"(offsetof(KernelArgs, N)), - [offsetof_input_initial_col] "I"(offsetof(KernelArgs, input_initial_col)), - [offsetof_input_offset] "I"(offsetof(KernelArgs, input_offset)), - [offsetof_num_strings] "I"(offsetof(KernelArgs, num_strings)), - [offsetof_output_offset] "I"(offsetof(KernelArgs, output_offset)), - [offsetof_output_ptr] "I"(offsetof(KernelArgs, output_ptr)), - [offsetof_string_lengths] "I"(offsetof(KernelArgs, string_lengths)) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", - "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl(input_ptr, m, &ka, flags); } #endif // Architectural features check. 
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h
index 0a733f1eb674b0609cfde6efafc59776fa9053ea..492170029f275d71c6ddc55838e9d6cc4c6c81bb 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h
@@ -1,12 +1,12 @@
 //
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates
 //
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
 
-#if !defined(__aarch64__)
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
 #error This file must be compiled for AArch64.
 #else // Architectural features check.
 
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S
new file mode 100644
index 0000000000000000000000000000000000000000..959c58c03c4f4751f253b6212be3520d5e5e0410
--- /dev/null
+++ b/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S
@@ -0,0 +1,1983 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef _MSC_VER
+
+#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4
+#define KAI_ASM_LABEL(label) |label|
+#define KAI_ASM_FUNCTION(label) |label|
+#define KAI_ASM_EXPORT(label) global label
+#define KAI_ASM_FOOTER end
+#define KAI_ASM_INST(num) dcd num
+
+#else // _MSC_VER
+
+#define KAI_ASM_HEADER .text
+#define KAI_ASM_LABEL(label) label:
+
+#ifdef __APPLE__
+#define KAI_ASM_FUNCTION(label) _##label:
+// Mach-O has no ELF .type directive, and ';' starts a comment (not a new statement)
+// in Darwin AArch64 assembly, so the Apple variant only exports the symbol.
+#define KAI_ASM_EXPORT(label) .global _##label
+#else // __APPLE__
+#define KAI_ASM_FUNCTION(label) label:
+#define KAI_ASM_EXPORT(label) \
+    .global label; \
+    .type label, %function
+#endif // __APPLE__
+
+#define KAI_ASM_FOOTER
+#define KAI_ASM_INST(num) .inst num
+
+#endif // _MSC_VER
+
+    KAI_ASM_HEADER
+
+    KAI_ASM_EXPORT(kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl)
+
+KAI_ASM_FUNCTION(kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_impl)
+    stp x20, x21, [sp, -144]!
+    stp x22, x23, [sp, 16]
+    stp x24, x25, [sp, 32]
+    stp x26, x27, [sp, 48]
+    str x28, [sp, 64]
+    stp d10, d11, [sp, 72]
+    stp d12, d13, [sp, 88]
+    stp d14, d15, [sp, 104]
+    stp d8, d9, [sp, 120]
+KAI_ASM_LABEL(label_1) // Row loop
+    cmp x1, #0x6
+    bge label_126
+    cmp x1, #0x4
+    bgt label_101
+    beq label_76
+    cmp x1, #0x2
+    bgt label_51
+    beq label_26
+    ldr x11, [x2, #0x18]
+    ldr x10, [x2, #0x20]
+    ldr x9, [x2, #0x40]
+KAI_ASM_LABEL(label_2) // Height 1: Column loop
+    cbz x10, label_3
+    ldr q20, [x10, #0x0]
+    ldr q21, [x10, #0x10]
+    add x10, x10, #0x20
+    b label_10
+KAI_ASM_LABEL(label_3) // Height 1: no bias
+    tbz x3, #0, label_9
+    cmp x11, #0x8
+    bge label_8
+    tbz x11, #2, label_5
+    ld1 { v20.4s }, [x9], #0x10
+    tbz x11, #1, label_4
+    ldr d21, [x9], #0x8
+    mov x20, #0x18
+    tbz x11, #0, label_7
+    ld1 { v21.s }[2], [x9]
+    b label_7
+KAI_ASM_LABEL(label_4) // Height 1: Partial accumulate: partial_1_4
+    mov x20, #0x10
+    tbz x11, #0, label_7
+    ldr s21, [x9, #0x0]
+    b label_7
+KAI_ASM_LABEL(label_5) // Height 1: Partial accumulate: partial_2_0
+    tbz x11, #1, label_6
+    ldr d20, [x9], #0x8
+    mov x20, #0x8
+    tbz x11, #0, label_7
+    ld1 { v20.s }[2], [x9]
+    b label_7
+KAI_ASM_LABEL(label_6) // Height 1: Partial accumulate: partial_1_0
+    ldr s20, [x9, #0x0]
+    mov x20, #0x0
+KAI_ASM_LABEL(label_7) // Height 1: Partial accumulate: Done
+    sub x9, x9, x20
+    b label_10
+KAI_ASM_LABEL(label_8) // Height 1: full accumulate
+    ldr q20, [x9, #0x0]
+    ldr q21, [x9, #0x10]
+ b label_10 +KAI_ASM_LABEL(label_9) // Height 1: no accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 +KAI_ASM_LABEL(label_10) // Height 1: setup done + mov x28, #0x0 +KAI_ASM_LABEL(label_11) // Height 1: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_12 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + cbnz x28, label_13 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + b label_13 +KAI_ASM_LABEL(label_12) // Height 1: setup direct input + mov x26, x0 +KAI_ASM_LABEL(label_13) // Height 1: input setup done + cmp x27, #0x4 + blt label_16 + ldr q0, [x26, #0x0] + ldr q6, [x10, #0x0] + cmp x27, #0x8 + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_15 +KAI_ASM_LABEL(label_14) // Height 1: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v21.4s, v7.4s, v0.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + cmp x27, #0x8 + add x10, x10, #0x80 + prfm pldl1keep, [x26, #0x80] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + ldr q8, [x10, #0x20] + fmla v21.4s, v9.4s, v0.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, v0.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + ldr q13, [x10, #0x70] + bge label_14 +KAI_ASM_LABEL(label_15) // Height 1: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v21.4s, v7.4s, v0.s[0] + add x26, x26, #0x10 + sub x27, x27, #0x4 + add x10, x10, #0x80 + prfm pldl1keep, [x26, #0x80] + fmla v20.4s, v8.4s, v0.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v20.4s, v10.4s, v0.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v21.4s, v13.4s, v0.s[3] +KAI_ASM_LABEL(label_16) // Height 1: Multiply 
loop: Main loop skip + cbz x27, label_18 +KAI_ASM_LABEL(label_17) // Height 1: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr q14, [x10, #0x0] + sub x27, x27, #0x1 + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v21.4s, v15.4s, v0.s[0] + cbnz x27, label_17 +KAI_ASM_LABEL(label_18) // Height 1: Multiply loop: No odd multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_11 + prfm pstl1keep, [x9, #0x0] + tbz x3, #1, label_19 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s +KAI_ASM_LABEL(label_19) // Height 1: No activation + cmp x11, #0x8 + bge label_24 + tbz x11, #2, label_21 + st1 { v20.4s }, [x9], #0x10 + tbz x11, #1, label_20 + str d21, [x9], #0x8 + tbz x11, #0, label_23 + st1 { v21.s }[2], [x9] + b label_23 +KAI_ASM_LABEL(label_20) // Height 1: Partial direct writeback: partial_1_4 + tbz x11, #0, label_23 + str s21, [x9, #0x0] + b label_23 +KAI_ASM_LABEL(label_21) // Height 1: Partial direct writeback: partial_2_0 + tbz x11, #1, label_22 + str d20, [x9], #0x8 + tbz x11, #0, label_23 + st1 { v20.s }[2], [x9] + b label_23 +KAI_ASM_LABEL(label_22) // Height 1: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] +KAI_ASM_LABEL(label_23) // Height 1: Partial direct writeback: Done + b label_25 +KAI_ASM_LABEL(label_24) // Height 1: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 +KAI_ASM_LABEL(label_25) // Height 1: Writeback done + subs x11, x11, #0x8 + bgt label_2 + b label_152 +KAI_ASM_LABEL(label_26) // Height 2 + ldr x11, [x2, #0x18] + ldr x10, [x2, #0x20] + ldr x9, [x2, #0x40] +KAI_ASM_LABEL(label_27) // Height 2: Column loop + cbz x10, label_28 + ldr q20, [x10, #0x0] + ldr q21, [x10, #0x10] + add x10, x10, #0x20 + mov v22.16b, v20.16b + mov v23.16b, v21.16b + b label_35 
+KAI_ASM_LABEL(label_28) // Height 2: no bias + tbz x3, #0, label_34 + ldr x20, [x2, #0x28] + cmp x11, #0x8 + add x26, x9, x20, LSL #2 + bge label_33 + tbz x11, #2, label_30 + ld1 { v20.4s }, [x9], #0x10 + ld1 { v22.4s }, [x26], #0x10 + tbz x11, #1, label_29 + ldr d21, [x9], #0x8 + ldr d23, [x26], #0x8 + mov x20, #0x18 + tbz x11, #0, label_32 + ld1 { v21.s }[2], [x9] + ld1 { v23.s }[2], [x26] + b label_32 +KAI_ASM_LABEL(label_29) // Height 2: Partial accumulate: partial_1_4 + mov x20, #0x10 + tbz x11, #0, label_32 + ldr s21, [x9, #0x0] + ldr s23, [x26, #0x0] + b label_32 +KAI_ASM_LABEL(label_30) // Height 2: Partial accumulate: partial_2_0 + tbz x11, #1, label_31 + ldr d20, [x9], #0x8 + ldr d22, [x26], #0x8 + mov x20, #0x8 + tbz x11, #0, label_32 + ld1 { v20.s }[2], [x9] + ld1 { v22.s }[2], [x26] + b label_32 +KAI_ASM_LABEL(label_31) // Height 2: Partial accumulate: partial_1_0 + ldr s20, [x9, #0x0] + ldr s22, [x26, #0x0] + mov x20, #0x0 +KAI_ASM_LABEL(label_32) // Height 2: Partial accumulate: Done + sub x9, x9, x20 + b label_35 +KAI_ASM_LABEL(label_33) // Height 2: full accumulate + ldr q20, [x9, #0x0] + ldr q21, [x9, #0x10] + ldr q22, [x26, #0x0] + ldr q23, [x26, #0x10] + b label_35 +KAI_ASM_LABEL(label_34) // Height 2: no accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 + movi v22.16b, #0x0 + movi v23.16b, #0x0 +KAI_ASM_LABEL(label_35) // Height 2: setup done + mov x28, #0x0 +KAI_ASM_LABEL(label_36) // Height 2: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_37 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + ldr x25, [x20, #0x8] + cbnz x28, label_38 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + add x25, x25, x20, LSL #2 + b label_38 +KAI_ASM_LABEL(label_37) // Height 2: setup direct input + mov x26, x0 + add x25, x26, x21, LSL #2 +KAI_ASM_LABEL(label_38) // Height 2: input setup done + cmp x27, #0x4 + blt label_41 + ldr q0, [x26, #0x0] + ldr q1, [x25, 
#0x0] + cmp x27, #0x8 + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_40 +KAI_ASM_LABEL(label_39) // Height 2: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + add x25, x25, #0x10 + cmp x27, #0x8 + add x10, x10, #0x80 + prfm pldl1keep, [x26, #0x80] + prfm pldl1keep, [x25, #0x80] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + ldr q8, [x10, #0x20] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + fmla v23.4s, v13.4s, v1.s[3] + ldr q1, [x25, #0x0] + ldr q13, [x10, #0x70] + bge label_39 +KAI_ASM_LABEL(label_40) // Height 2: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + add x26, x26, #0x10 + add x25, x25, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + sub x27, x27, #0x4 + prfm pldl1keep, [x26, #0x80] + prfm pldl1keep, [x25, #0x80] + add x10, x10, #0x80 + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v21.4s, v13.4s, v0.s[3] + fmla v23.4s, v13.4s, v1.s[3] +KAI_ASM_LABEL(label_41) // Height 2: Multiply loop: Main loop skip + cbz x27, label_43 
+KAI_ASM_LABEL(label_42) // Height 2: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr s1, [x25], #0x4 + sub x27, x27, #0x1 + ldr q14, [x10, #0x0] + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v22.4s, v14.4s, v1.s[0] + fmla v21.4s, v15.4s, v0.s[0] + fmla v23.4s, v15.4s, v1.s[0] + cbnz x27, label_42 +KAI_ASM_LABEL(label_43) // Height 2: Multiply loop: No odd multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_36 + ldr x20, [x2, #0x28] + prfm pstl1keep, [x9, #0x0] + add x26, x9, x20, LSL #2 + prfm pstl1keep, [x26, #0x0] + tbz x3, #1, label_44 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmin v22.4s, v22.4s, v17.4s + fmin v23.4s, v23.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s + fmax v22.4s, v22.4s, v16.4s + fmax v23.4s, v23.4s, v16.4s +KAI_ASM_LABEL(label_44) // Height 2: No activation + cmp x11, #0x8 + bge label_49 + tbz x11, #2, label_46 + st1 { v20.4s }, [x9], #0x10 + st1 { v22.4s }, [x26], #0x10 + tbz x11, #1, label_45 + str d21, [x9], #0x8 + str d23, [x26], #0x8 + tbz x11, #0, label_48 + st1 { v21.s }[2], [x9] + st1 { v23.s }[2], [x26] + b label_48 +KAI_ASM_LABEL(label_45) // Height 2: Partial direct writeback: partial_1_4 + tbz x11, #0, label_48 + str s21, [x9, #0x0] + str s23, [x26, #0x0] + b label_48 +KAI_ASM_LABEL(label_46) // Height 2: Partial direct writeback: partial_2_0 + tbz x11, #1, label_47 + str d20, [x9], #0x8 + str d22, [x26], #0x8 + tbz x11, #0, label_48 + st1 { v20.s }[2], [x9] + st1 { v22.s }[2], [x26] + b label_48 +KAI_ASM_LABEL(label_47) // Height 2: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] + str s22, [x26, #0x0] +KAI_ASM_LABEL(label_48) // Height 2: Partial direct writeback: Done + b label_50 +KAI_ASM_LABEL(label_49) // Height 2: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 + str 
q22, [x26, #0x0] + str q23, [x26, #0x10] +KAI_ASM_LABEL(label_50) // Height 2: Writeback done + subs x11, x11, #0x8 + bgt label_27 + b label_152 +KAI_ASM_LABEL(label_51) // Height 3 + ldr x11, [x2, #0x18] + ldr x10, [x2, #0x20] + ldr x9, [x2, #0x40] +KAI_ASM_LABEL(label_52) // Height 3: Column loop + cbz x10, label_53 + ldr q20, [x10, #0x0] + ldr q21, [x10, #0x10] + add x10, x10, #0x20 + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + b label_60 +KAI_ASM_LABEL(label_53) // Height 3: no bias + tbz x3, #0, label_59 + ldr x20, [x2, #0x28] + cmp x11, #0x8 + add x26, x9, x20, LSL #2 + add x25, x26, x20, LSL #2 + bge label_58 + tbz x11, #2, label_55 + ld1 { v20.4s }, [x9], #0x10 + ld1 { v22.4s }, [x26], #0x10 + ld1 { v24.4s }, [x25], #0x10 + tbz x11, #1, label_54 + ldr d21, [x9], #0x8 + ldr d23, [x26], #0x8 + mov x20, #0x18 + ldr d25, [x25], #0x8 + tbz x11, #0, label_57 + ld1 { v21.s }[2], [x9] + ld1 { v23.s }[2], [x26] + ld1 { v25.s }[2], [x25] + b label_57 +KAI_ASM_LABEL(label_54) // Height 3: Partial accumulate: partial_1_4 + mov x20, #0x10 + tbz x11, #0, label_57 + ldr s21, [x9, #0x0] + ldr s23, [x26, #0x0] + ldr s25, [x25, #0x0] + b label_57 +KAI_ASM_LABEL(label_55) // Height 3: Partial accumulate: partial_2_0 + tbz x11, #1, label_56 + ldr d20, [x9], #0x8 + ldr d22, [x26], #0x8 + mov x20, #0x8 + ldr d24, [x25], #0x8 + tbz x11, #0, label_57 + ld1 { v20.s }[2], [x9] + ld1 { v22.s }[2], [x26] + ld1 { v24.s }[2], [x25] + b label_57 +KAI_ASM_LABEL(label_56) // Height 3: Partial accumulate: partial_1_0 + ldr s20, [x9, #0x0] + ldr s22, [x26, #0x0] + mov x20, #0x0 + ldr s24, [x25, #0x0] +KAI_ASM_LABEL(label_57) // Height 3: Partial accumulate: Done + sub x9, x9, x20 + b label_60 +KAI_ASM_LABEL(label_58) // Height 3: full accumulate + ldr q20, [x9, #0x0] + ldr q21, [x9, #0x10] + ldr q22, [x26, #0x0] + ldr q23, [x26, #0x10] + ldr q24, [x25, #0x0] + ldr q25, [x25, #0x10] + b label_60 +KAI_ASM_LABEL(label_59) // Height 3: no 
accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 + movi v22.16b, #0x0 + movi v23.16b, #0x0 + movi v24.16b, #0x0 + movi v25.16b, #0x0 +KAI_ASM_LABEL(label_60) // Height 3: setup done + mov x28, #0x0 +KAI_ASM_LABEL(label_61) // Height 3: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_62 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + ldr x25, [x20, #0x8] + ldr x24, [x20, #0x10] + cbnz x28, label_63 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + add x25, x25, x20, LSL #2 + add x24, x24, x20, LSL #2 + b label_63 +KAI_ASM_LABEL(label_62) // Height 3: setup direct input + mov x26, x0 + add x25, x26, x21, LSL #2 + add x24, x25, x21, LSL #2 +KAI_ASM_LABEL(label_63) // Height 3: input setup done + cmp x27, #0x4 + blt label_66 + ldr q0, [x26, #0x0] + ldr q1, [x25, #0x0] + cmp x27, #0x8 + ldr q2, [x24, #0x0] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_65 +KAI_ASM_LABEL(label_64) // Height 3: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v21.4s, v7.4s, v0.s[0] + add x25, x25, #0x10 + add x24, x24, #0x10 + fmla v23.4s, v7.4s, v1.s[0] + fmla v25.4s, v7.4s, v2.s[0] + cmp x27, #0x8 + add x10, x10, #0x80 + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm pldl1keep, [x26, #0x80] + prfm pldl1keep, [x25, #0x80] + fmla v24.4s, v8.4s, v2.s[1] + ldr q8, [x10, #0x20] + fmla v21.4s, v9.4s, v0.s[1] + prfm pldl1keep, [x24, #0x80] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, 
v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + fmla v23.4s, v13.4s, v1.s[3] + ldr q1, [x25, #0x0] + fmla v25.4s, v13.4s, v2.s[3] + ldr q2, [x24, #0x0] + ldr q13, [x10, #0x70] + bge label_64 +KAI_ASM_LABEL(label_65) // Height 3: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + add x26, x26, #0x10 + add x25, x25, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v21.4s, v7.4s, v0.s[0] + add x24, x24, #0x10 + prfm pldl1keep, [x26, #0x80] + fmla v23.4s, v7.4s, v1.s[0] + fmla v25.4s, v7.4s, v2.s[0] + sub x27, x27, #0x4 + prfm pldl1keep, [x25, #0x80] + add x10, x10, #0x80 + prfm pldl1keep, [x24, #0x80] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + fmla v24.4s, v8.4s, v2.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v21.4s, v13.4s, v0.s[3] + fmla v23.4s, v13.4s, v1.s[3] + fmla v25.4s, v13.4s, v2.s[3] +KAI_ASM_LABEL(label_66) // Height 3: Multiply loop: Main loop skip + cbz x27, label_68 +KAI_ASM_LABEL(label_67) // Height 3: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr s1, [x25], #0x4 + sub x27, x27, #0x1 + ldr s2, [x24], #0x4 + ldr q14, [x10, #0x0] + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v22.4s, v14.4s, v1.s[0] + fmla v24.4s, v14.4s, v2.s[0] + fmla v21.4s, v15.4s, v0.s[0] + fmla v23.4s, v15.4s, v1.s[0] + fmla v25.4s, v15.4s, v2.s[0] + cbnz x27, label_67 +KAI_ASM_LABEL(label_68) // Height 3: Multiply loop: No odd 
multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_61 + ldr x20, [x2, #0x28] + prfm pstl1keep, [x9, #0x0] + add x26, x9, x20, LSL #2 + prfm pstl1keep, [x26, #0x0] + add x25, x26, x20, LSL #2 + prfm pstl1keep, [x25, #0x0] + tbz x3, #1, label_69 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmin v22.4s, v22.4s, v17.4s + fmin v23.4s, v23.4s, v17.4s + fmin v24.4s, v24.4s, v17.4s + fmin v25.4s, v25.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s + fmax v22.4s, v22.4s, v16.4s + fmax v23.4s, v23.4s, v16.4s + fmax v24.4s, v24.4s, v16.4s + fmax v25.4s, v25.4s, v16.4s +KAI_ASM_LABEL(label_69) // Height 3: No activation + cmp x11, #0x8 + bge label_74 + tbz x11, #2, label_71 + st1 { v20.4s }, [x9], #0x10 + st1 { v22.4s }, [x26], #0x10 + st1 { v24.4s }, [x25], #0x10 + tbz x11, #1, label_70 + str d21, [x9], #0x8 + str d23, [x26], #0x8 + str d25, [x25], #0x8 + tbz x11, #0, label_73 + st1 { v21.s }[2], [x9] + st1 { v23.s }[2], [x26] + st1 { v25.s }[2], [x25] + b label_73 +KAI_ASM_LABEL(label_70) // Height 3: Partial direct writeback: partial_1_4 + tbz x11, #0, label_73 + str s21, [x9, #0x0] + str s23, [x26, #0x0] + str s25, [x25, #0x0] + b label_73 +KAI_ASM_LABEL(label_71) // Height 3: Partial direct writeback: partial_2_0 + tbz x11, #1, label_72 + str d20, [x9], #0x8 + str d22, [x26], #0x8 + str d24, [x25], #0x8 + tbz x11, #0, label_73 + st1 { v20.s }[2], [x9] + st1 { v22.s }[2], [x26] + st1 { v24.s }[2], [x25] + b label_73 +KAI_ASM_LABEL(label_72) // Height 3: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] + str s22, [x26, #0x0] + str s24, [x25, #0x0] +KAI_ASM_LABEL(label_73) // Height 3: Partial direct writeback: Done + b label_75 +KAI_ASM_LABEL(label_74) // Height 3: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 + str q22, [x26, #0x0] + str q23, [x26, #0x10] + str q24, [x25, 
#0x0] + str q25, [x25, #0x10] +KAI_ASM_LABEL(label_75) // Height 3: Writeback done + subs x11, x11, #0x8 + bgt label_52 + b label_152 +KAI_ASM_LABEL(label_76) // Height 4 + ldr x11, [x2, #0x18] + ldr x10, [x2, #0x20] + ldr x9, [x2, #0x40] +KAI_ASM_LABEL(label_77) // Height 4: Column loop + cbz x10, label_78 + ldr q20, [x10, #0x0] + ldr q21, [x10, #0x10] + add x10, x10, #0x20 + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v26.16b, v20.16b + mov v27.16b, v21.16b + b label_85 +KAI_ASM_LABEL(label_78) // Height 4: no bias + tbz x3, #0, label_84 + ldr x20, [x2, #0x28] + cmp x11, #0x8 + add x26, x9, x20, LSL #2 + add x25, x26, x20, LSL #2 + add x24, x25, x20, LSL #2 + bge label_83 + tbz x11, #2, label_80 + ld1 { v20.4s }, [x9], #0x10 + ld1 { v22.4s }, [x26], #0x10 + ld1 { v24.4s }, [x25], #0x10 + ld1 { v26.4s }, [x24], #0x10 + tbz x11, #1, label_79 + ldr d21, [x9], #0x8 + ldr d23, [x26], #0x8 + mov x20, #0x18 + ldr d25, [x25], #0x8 + ldr d27, [x24], #0x8 + tbz x11, #0, label_82 + ld1 { v21.s }[2], [x9] + ld1 { v23.s }[2], [x26] + ld1 { v25.s }[2], [x25] + ld1 { v27.s }[2], [x24] + b label_82 +KAI_ASM_LABEL(label_79) // Height 4: Partial accumulate: partial_1_4 + mov x20, #0x10 + tbz x11, #0, label_82 + ldr s21, [x9, #0x0] + ldr s23, [x26, #0x0] + ldr s25, [x25, #0x0] + ldr s27, [x24, #0x0] + b label_82 +KAI_ASM_LABEL(label_80) // Height 4: Partial accumulate: partial_2_0 + tbz x11, #1, label_81 + ldr d20, [x9], #0x8 + ldr d22, [x26], #0x8 + mov x20, #0x8 + ldr d24, [x25], #0x8 + ldr d26, [x24], #0x8 + tbz x11, #0, label_82 + ld1 { v20.s }[2], [x9] + ld1 { v22.s }[2], [x26] + ld1 { v24.s }[2], [x25] + ld1 { v26.s }[2], [x24] + b label_82 +KAI_ASM_LABEL(label_81) // Height 4: Partial accumulate: partial_1_0 + ldr s20, [x9, #0x0] + ldr s22, [x26, #0x0] + mov x20, #0x0 + ldr s24, [x25, #0x0] + ldr s26, [x24, #0x0] +KAI_ASM_LABEL(label_82) // Height 4: Partial accumulate: Done + sub x9, x9, x20 + b label_85 
+KAI_ASM_LABEL(label_83) // Height 4: full accumulate + ldr q20, [x9, #0x0] + ldr q21, [x9, #0x10] + ldr q22, [x26, #0x0] + ldr q23, [x26, #0x10] + ldr q24, [x25, #0x0] + ldr q25, [x25, #0x10] + ldr q26, [x24, #0x0] + ldr q27, [x24, #0x10] + b label_85 +KAI_ASM_LABEL(label_84) // Height 4: no accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 + movi v22.16b, #0x0 + movi v23.16b, #0x0 + movi v24.16b, #0x0 + movi v25.16b, #0x0 + movi v26.16b, #0x0 + movi v27.16b, #0x0 +KAI_ASM_LABEL(label_85) // Height 4: setup done + mov x28, #0x0 +KAI_ASM_LABEL(label_86) // Height 4: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_87 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + ldr x25, [x20, #0x8] + ldr x24, [x20, #0x10] + ldr x23, [x20, #0x18] + cbnz x28, label_88 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + add x25, x25, x20, LSL #2 + add x24, x24, x20, LSL #2 + add x23, x23, x20, LSL #2 + b label_88 +KAI_ASM_LABEL(label_87) // Height 4: setup direct input + mov x26, x0 + add x25, x26, x21, LSL #2 + add x24, x25, x21, LSL #2 + add x23, x24, x21, LSL #2 +KAI_ASM_LABEL(label_88) // Height 4: input setup done + cmp x27, #0x4 + blt label_91 + ldr q0, [x26, #0x0] + ldr q1, [x25, #0x0] + cmp x27, #0x8 + ldr q2, [x24, #0x0] + ldr q3, [x23, #0x0] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_90 +KAI_ASM_LABEL(label_89) // Height 4: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x25, x25, #0x10 + add x24, x24, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + add x23, x23, #0x10 + cmp x27, #0x8 + fmla v25.4s, v7.4s, v2.s[0] + fmla v27.4s, v7.4s, v3.s[0] + add 
x10, x10, #0x80 + prfm pldl1keep, [x26, #0x80] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + ldr q8, [x10, #0x20] + prfm pldl1keep, [x25, #0x80] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + prfm pldl1keep, [x24, #0x80] + prfm pldl1keep, [x23, #0x80] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + fmla v23.4s, v13.4s, v1.s[3] + ldr q1, [x25, #0x0] + fmla v25.4s, v13.4s, v2.s[3] + ldr q2, [x24, #0x0] + fmla v27.4s, v13.4s, v3.s[3] + ldr q3, [x23, #0x0] + ldr q13, [x10, #0x70] + bge label_89 +KAI_ASM_LABEL(label_90) // Height 4: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + add x26, x26, #0x10 + add x25, x25, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x24, x24, #0x10 + add x23, x23, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + sub x27, x27, #0x4 + prfm pldl1keep, [x26, #0x80] + fmla v25.4s, v7.4s, v2.s[0] + fmla v27.4s, v7.4s, v3.s[0] + prfm pldl1keep, [x25, #0x80] + prfm pldl1keep, [x24, #0x80] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm pldl1keep, [x23, #0x80] + add x10, x10, #0x80 + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + fmla v20.4s, v10.4s, 
v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + fmla v21.4s, v13.4s, v0.s[3] + fmla v23.4s, v13.4s, v1.s[3] + fmla v25.4s, v13.4s, v2.s[3] + fmla v27.4s, v13.4s, v3.s[3] +KAI_ASM_LABEL(label_91) // Height 4: Multiply loop: Main loop skip + cbz x27, label_93 +KAI_ASM_LABEL(label_92) // Height 4: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr s1, [x25], #0x4 + sub x27, x27, #0x1 + ldr s2, [x24], #0x4 + ldr s3, [x23], #0x4 + ldr q14, [x10, #0x0] + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v22.4s, v14.4s, v1.s[0] + fmla v24.4s, v14.4s, v2.s[0] + fmla v26.4s, v14.4s, v3.s[0] + fmla v21.4s, v15.4s, v0.s[0] + fmla v23.4s, v15.4s, v1.s[0] + fmla v25.4s, v15.4s, v2.s[0] + fmla v27.4s, v15.4s, v3.s[0] + cbnz x27, label_92 +KAI_ASM_LABEL(label_93) // Height 4: Multiply loop: No odd multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_86 + ldr x20, [x2, #0x28] + prfm pstl1keep, [x9, #0x0] + add x26, x9, x20, LSL #2 + prfm pstl1keep, [x26, #0x0] + add x25, x26, x20, LSL #2 + prfm pstl1keep, [x25, #0x0] + add x24, x25, x20, LSL #2 + prfm pstl1keep, [x24, #0x0] + tbz x3, #1, label_94 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmin v22.4s, v22.4s, v17.4s + fmin v23.4s, v23.4s, v17.4s + fmin v24.4s, v24.4s, v17.4s + fmin v25.4s, v25.4s, v17.4s + fmin v26.4s, v26.4s, v17.4s + fmin v27.4s, v27.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s + fmax v22.4s, v22.4s, v16.4s + fmax v23.4s, v23.4s, v16.4s + fmax v24.4s, v24.4s, v16.4s + fmax v25.4s, v25.4s, v16.4s + fmax v26.4s, 
v26.4s, v16.4s + fmax v27.4s, v27.4s, v16.4s +KAI_ASM_LABEL(label_94) // Height 4: No activation + cmp x11, #0x8 + bge label_99 + tbz x11, #2, label_96 + st1 { v20.4s }, [x9], #0x10 + st1 { v22.4s }, [x26], #0x10 + st1 { v24.4s }, [x25], #0x10 + st1 { v26.4s }, [x24], #0x10 + tbz x11, #1, label_95 + str d21, [x9], #0x8 + str d23, [x26], #0x8 + str d25, [x25], #0x8 + str d27, [x24], #0x8 + tbz x11, #0, label_98 + st1 { v21.s }[2], [x9] + st1 { v23.s }[2], [x26] + st1 { v25.s }[2], [x25] + st1 { v27.s }[2], [x24] + b label_98 +KAI_ASM_LABEL(label_95) // Height 4: Partial direct writeback: partial_1_4 + tbz x11, #0, label_98 + str s21, [x9, #0x0] + str s23, [x26, #0x0] + str s25, [x25, #0x0] + str s27, [x24, #0x0] + b label_98 +KAI_ASM_LABEL(label_96) // Height 4: Partial direct writeback: partial_2_0 + tbz x11, #1, label_97 + str d20, [x9], #0x8 + str d22, [x26], #0x8 + str d24, [x25], #0x8 + str d26, [x24], #0x8 + tbz x11, #0, label_98 + st1 { v20.s }[2], [x9] + st1 { v22.s }[2], [x26] + st1 { v24.s }[2], [x25] + st1 { v26.s }[2], [x24] + b label_98 +KAI_ASM_LABEL(label_97) // Height 4: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] + str s22, [x26, #0x0] + str s24, [x25, #0x0] + str s26, [x24, #0x0] +KAI_ASM_LABEL(label_98) // Height 4: Partial direct writeback: Done + b label_100 +KAI_ASM_LABEL(label_99) // Height 4: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 + str q22, [x26, #0x0] + str q23, [x26, #0x10] + str q24, [x25, #0x0] + str q25, [x25, #0x10] + str q26, [x24, #0x0] + str q27, [x24, #0x10] +KAI_ASM_LABEL(label_100) // Height 4: Writeback done + subs x11, x11, #0x8 + bgt label_77 + b label_152 +KAI_ASM_LABEL(label_101) // Height 5 + ldr x11, [x2, #0x18] + ldr x10, [x2, #0x20] + ldr x9, [x2, #0x40] +KAI_ASM_LABEL(label_102) // Height 5: Column loop + cbz x10, label_103 + ldr q20, [x10, #0x0] + ldr q21, [x10, #0x10] + add x10, x10, #0x20 + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v24.16b, v20.16b + 
mov v25.16b, v21.16b + mov v26.16b, v20.16b + mov v27.16b, v21.16b + mov v28.16b, v20.16b + mov v29.16b, v21.16b + b label_110 +KAI_ASM_LABEL(label_103) // Height 5: no bias + tbz x3, #0, label_109 + ldr x20, [x2, #0x28] + cmp x11, #0x8 + add x26, x9, x20, LSL #2 + add x25, x26, x20, LSL #2 + add x24, x25, x20, LSL #2 + add x23, x24, x20, LSL #2 + bge label_108 + tbz x11, #2, label_105 + ld1 { v20.4s }, [x9], #0x10 + ld1 { v22.4s }, [x26], #0x10 + ld1 { v24.4s }, [x25], #0x10 + ld1 { v26.4s }, [x24], #0x10 + ld1 { v28.4s }, [x23], #0x10 + tbz x11, #1, label_104 + ldr d21, [x9], #0x8 + ldr d23, [x26], #0x8 + mov x20, #0x18 + ldr d25, [x25], #0x8 + ldr d27, [x24], #0x8 + ldr d29, [x23], #0x8 + tbz x11, #0, label_107 + ld1 { v21.s }[2], [x9] + ld1 { v23.s }[2], [x26] + ld1 { v25.s }[2], [x25] + ld1 { v27.s }[2], [x24] + ld1 { v29.s }[2], [x23] + b label_107 +KAI_ASM_LABEL(label_104) // Height 5: Partial accumulate: partial_1_4 + mov x20, #0x10 + tbz x11, #0, label_107 + ldr s21, [x9, #0x0] + ldr s23, [x26, #0x0] + ldr s25, [x25, #0x0] + ldr s27, [x24, #0x0] + ldr s29, [x23, #0x0] + b label_107 +KAI_ASM_LABEL(label_105) // Height 5: Partial accumulate: partial_2_0 + tbz x11, #1, label_106 + ldr d20, [x9], #0x8 + ldr d22, [x26], #0x8 + mov x20, #0x8 + ldr d24, [x25], #0x8 + ldr d26, [x24], #0x8 + ldr d28, [x23], #0x8 + tbz x11, #0, label_107 + ld1 { v20.s }[2], [x9] + ld1 { v22.s }[2], [x26] + ld1 { v24.s }[2], [x25] + ld1 { v26.s }[2], [x24] + ld1 { v28.s }[2], [x23] + b label_107 +KAI_ASM_LABEL(label_106) // Height 5: Partial accumulate: partial_1_0 + ldr s20, [x9, #0x0] + ldr s22, [x26, #0x0] + mov x20, #0x0 + ldr s24, [x25, #0x0] + ldr s26, [x24, #0x0] + ldr s28, [x23, #0x0] +KAI_ASM_LABEL(label_107) // Height 5: Partial accumulate: Done + sub x9, x9, x20 + b label_110 +KAI_ASM_LABEL(label_108) // Height 5: full accumulate + ldr q20, [x9, #0x0] + ldr q21, [x9, #0x10] + ldr q22, [x26, #0x0] + ldr q23, [x26, #0x10] + ldr q24, [x25, #0x0] + ldr q25, [x25, #0x10] + ldr 
q26, [x24, #0x0] + ldr q27, [x24, #0x10] + ldr q28, [x23, #0x0] + ldr q29, [x23, #0x10] + b label_110 +KAI_ASM_LABEL(label_109) // Height 5: no accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 + movi v22.16b, #0x0 + movi v23.16b, #0x0 + movi v24.16b, #0x0 + movi v25.16b, #0x0 + movi v26.16b, #0x0 + movi v27.16b, #0x0 + movi v28.16b, #0x0 + movi v29.16b, #0x0 +KAI_ASM_LABEL(label_110) // Height 5: setup done + mov x28, #0x0 +KAI_ASM_LABEL(label_111) // Height 5: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_112 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + ldr x25, [x20, #0x8] + ldr x24, [x20, #0x10] + ldr x23, [x20, #0x18] + ldr x22, [x20, #0x20] + cbnz x28, label_113 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + add x25, x25, x20, LSL #2 + add x24, x24, x20, LSL #2 + add x23, x23, x20, LSL #2 + add x22, x22, x20, LSL #2 + b label_113 +KAI_ASM_LABEL(label_112) // Height 5: setup direct input + mov x26, x0 + add x25, x26, x21, LSL #2 + add x24, x25, x21, LSL #2 + add x23, x24, x21, LSL #2 + add x22, x23, x21, LSL #2 +KAI_ASM_LABEL(label_113) // Height 5: input setup done + cmp x27, #0x4 + blt label_116 + ldr q0, [x26, #0x0] + ldr q1, [x25, #0x0] + cmp x27, #0x8 + ldr q2, [x24, #0x0] + ldr q3, [x23, #0x0] + ldr q4, [x22, #0x0] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_115 +KAI_ASM_LABEL(label_114) // Height 5: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x25, x25, #0x10 + add x24, x24, #0x10 + fmla v28.4s, v6.4s, v4.s[0] + fmla v21.4s, v7.4s, v0.s[0] + add x23, x23, #0x10 + add x22, x22, #0x10 + fmla v23.4s, v7.4s, v1.s[0] + fmla v25.4s, v7.4s, 
v2.s[0] + cmp x27, #0x8 + add x10, x10, #0x80 + ldr q6, [x10, #0x0] + fmla v27.4s, v7.4s, v3.s[0] + fmla v29.4s, v7.4s, v4.s[0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm pldl1keep, [x26, #0x80] + prfm pldl1keep, [x25, #0x80] + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + prfm pldl1keep, [x24, #0x80] + prfm pldl1keep, [x23, #0x80] + fmla v28.4s, v8.4s, v4.s[1] + ldr q8, [x10, #0x20] + fmla v21.4s, v9.4s, v0.s[1] + prfm pldl1keep, [x22, #0x80] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + fmla v29.4s, v9.4s, v4.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + fmla v28.4s, v10.4s, v4.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + fmla v29.4s, v11.4s, v4.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + fmla v28.4s, v12.4s, v4.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + fmla v23.4s, v13.4s, v1.s[3] + ldr q1, [x25, #0x0] + fmla v25.4s, v13.4s, v2.s[3] + ldr q2, [x24, #0x0] + fmla v27.4s, v13.4s, v3.s[3] + ldr q3, [x23, #0x0] + fmla v29.4s, v13.4s, v4.s[3] + ldr q4, [x22, #0x0] + ldr q13, [x10, #0x70] + bge label_114 +KAI_ASM_LABEL(label_115) // Height 5: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + add x26, x26, #0x10 + add x25, x25, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x24, x24, #0x10 + add x23, x23, #0x10 + fmla v28.4s, v6.4s, v4.s[0] + fmla v21.4s, v7.4s, v0.s[0] + add x22, x22, #0x10 + sub x27, x27, #0x4 + fmla v23.4s, v7.4s, v1.s[0] + fmla v25.4s, v7.4s, v2.s[0] + prfm pldl1keep, [x26, #0x80] + prfm pldl1keep, [x25, 
#0x80] + fmla v27.4s, v7.4s, v3.s[0] + fmla v29.4s, v7.4s, v4.s[0] + prfm pldl1keep, [x24, #0x80] + prfm pldl1keep, [x23, #0x80] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm pldl1keep, [x22, #0x80] + add x10, x10, #0x80 + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + fmla v28.4s, v8.4s, v4.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + fmla v29.4s, v9.4s, v4.s[1] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + fmla v28.4s, v10.4s, v4.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + fmla v29.4s, v11.4s, v4.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + fmla v28.4s, v12.4s, v4.s[3] + fmla v21.4s, v13.4s, v0.s[3] + fmla v23.4s, v13.4s, v1.s[3] + fmla v25.4s, v13.4s, v2.s[3] + fmla v27.4s, v13.4s, v3.s[3] + fmla v29.4s, v13.4s, v4.s[3] +KAI_ASM_LABEL(label_116) // Height 5: Multiply loop: Main loop skip + cbz x27, label_118 +KAI_ASM_LABEL(label_117) // Height 5: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr s1, [x25], #0x4 + sub x27, x27, #0x1 + ldr s2, [x24], #0x4 + ldr s3, [x23], #0x4 + ldr s4, [x22], #0x4 + ldr q14, [x10, #0x0] + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v22.4s, v14.4s, v1.s[0] + fmla v24.4s, v14.4s, v2.s[0] + fmla v26.4s, v14.4s, v3.s[0] + fmla v28.4s, v14.4s, v4.s[0] + fmla v21.4s, v15.4s, v0.s[0] + fmla v23.4s, v15.4s, v1.s[0] + fmla v25.4s, v15.4s, v2.s[0] + fmla v27.4s, v15.4s, v3.s[0] + fmla v29.4s, v15.4s, v4.s[0] + cbnz x27, label_117 +KAI_ASM_LABEL(label_118) // Height 5: Multiply loop: No odd multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_111 + ldr x20, [x2, #0x28] + prfm pstl1keep, [x9, #0x0] + add 
x26, x9, x20, LSL #2 + prfm pstl1keep, [x26, #0x0] + add x25, x26, x20, LSL #2 + prfm pstl1keep, [x25, #0x0] + add x24, x25, x20, LSL #2 + prfm pstl1keep, [x24, #0x0] + add x23, x24, x20, LSL #2 + prfm pstl1keep, [x23, #0x0] + tbz x3, #1, label_119 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmin v22.4s, v22.4s, v17.4s + fmin v23.4s, v23.4s, v17.4s + fmin v24.4s, v24.4s, v17.4s + fmin v25.4s, v25.4s, v17.4s + fmin v26.4s, v26.4s, v17.4s + fmin v27.4s, v27.4s, v17.4s + fmin v28.4s, v28.4s, v17.4s + fmin v29.4s, v29.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s + fmax v22.4s, v22.4s, v16.4s + fmax v23.4s, v23.4s, v16.4s + fmax v24.4s, v24.4s, v16.4s + fmax v25.4s, v25.4s, v16.4s + fmax v26.4s, v26.4s, v16.4s + fmax v27.4s, v27.4s, v16.4s + fmax v28.4s, v28.4s, v16.4s + fmax v29.4s, v29.4s, v16.4s +KAI_ASM_LABEL(label_119) // Height 5: No activation + cmp x11, #0x8 + bge label_124 + tbz x11, #2, label_121 + st1 { v20.4s }, [x9], #0x10 + st1 { v22.4s }, [x26], #0x10 + st1 { v24.4s }, [x25], #0x10 + st1 { v26.4s }, [x24], #0x10 + st1 { v28.4s }, [x23], #0x10 + tbz x11, #1, label_120 + str d21, [x9], #0x8 + str d23, [x26], #0x8 + str d25, [x25], #0x8 + str d27, [x24], #0x8 + str d29, [x23], #0x8 + tbz x11, #0, label_123 + st1 { v21.s }[2], [x9] + st1 { v23.s }[2], [x26] + st1 { v25.s }[2], [x25] + st1 { v27.s }[2], [x24] + st1 { v29.s }[2], [x23] + b label_123 +KAI_ASM_LABEL(label_120) // Height 5: Partial direct writeback: partial_1_4 + tbz x11, #0, label_123 + str s21, [x9, #0x0] + str s23, [x26, #0x0] + str s25, [x25, #0x0] + str s27, [x24, #0x0] + str s29, [x23, #0x0] + b label_123 +KAI_ASM_LABEL(label_121) // Height 5: Partial direct writeback: partial_2_0 + tbz x11, #1, label_122 + str d20, [x9], #0x8 + str d22, [x26], #0x8 + str d24, [x25], #0x8 + str d26, [x24], #0x8 + str d28, [x23], #0x8 + tbz x11, #0, label_123 + st1 { v20.s 
}[2], [x9] + st1 { v22.s }[2], [x26] + st1 { v24.s }[2], [x25] + st1 { v26.s }[2], [x24] + st1 { v28.s }[2], [x23] + b label_123 +KAI_ASM_LABEL(label_122) // Height 5: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] + str s22, [x26, #0x0] + str s24, [x25, #0x0] + str s26, [x24, #0x0] + str s28, [x23, #0x0] +KAI_ASM_LABEL(label_123) // Height 5: Partial direct writeback: Done + b label_125 +KAI_ASM_LABEL(label_124) // Height 5: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 + str q22, [x26, #0x0] + str q23, [x26, #0x10] + str q24, [x25, #0x0] + str q25, [x25, #0x10] + str q26, [x24, #0x0] + str q27, [x24, #0x10] + str q28, [x23, #0x0] + str q29, [x23, #0x10] +KAI_ASM_LABEL(label_125) // Height 5: Writeback done + subs x11, x11, #0x8 + bgt label_102 + b label_152 +KAI_ASM_LABEL(label_126) // Height 6 + ldr x21, [x2, #0x28] + ldr x9, [x2, #0x40] + mov x20, #0x18 + ldr x11, [x2, #0x18] + ldr x10, [x2, #0x20] + madd x20, x21, x20, x9 + str x20, [x2, #0x40] +KAI_ASM_LABEL(label_127) // Height 6: Column loop + cbz x10, label_128 + ldr q20, [x10, #0x0] + ldr q21, [x10, #0x10] + add x10, x10, #0x20 + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v26.16b, v20.16b + mov v27.16b, v21.16b + mov v28.16b, v20.16b + mov v29.16b, v21.16b + mov v30.16b, v20.16b + mov v31.16b, v21.16b + b label_135 +KAI_ASM_LABEL(label_128) // Height 6: no bias + tbz x3, #0, label_134 + ldr x20, [x2, #0x28] + cmp x11, #0x8 + add x26, x9, x20, LSL #2 + add x25, x26, x20, LSL #2 + add x24, x25, x20, LSL #2 + add x23, x24, x20, LSL #2 + add x22, x23, x20, LSL #2 + bge label_133 + tbz x11, #2, label_130 + ld1 { v20.4s }, [x9], #0x10 + ld1 { v22.4s }, [x26], #0x10 + ld1 { v24.4s }, [x25], #0x10 + ld1 { v26.4s }, [x24], #0x10 + ld1 { v28.4s }, [x23], #0x10 + ld1 { v30.4s }, [x22], #0x10 + tbz x11, #1, label_129 + ldr d21, [x9], #0x8 + ldr d23, [x26], #0x8 + mov x20, #0x18 + ldr d25, [x25], #0x8 + ldr d27, [x24], 
#0x8 + ldr d29, [x23], #0x8 + ldr d31, [x22], #0x8 + tbz x11, #0, label_132 + ld1 { v21.s }[2], [x9] + ld1 { v23.s }[2], [x26] + ld1 { v25.s }[2], [x25] + ld1 { v27.s }[2], [x24] + ld1 { v29.s }[2], [x23] + ld1 { v31.s }[2], [x22] + b label_132 +KAI_ASM_LABEL(label_129) // Height 6: Partial accumulate: partial_1_4 + mov x20, #0x10 + tbz x11, #0, label_132 + ldr s21, [x9, #0x0] + ldr s23, [x26, #0x0] + ldr s25, [x25, #0x0] + ldr s27, [x24, #0x0] + ldr s29, [x23, #0x0] + ldr s31, [x22, #0x0] + b label_132 +KAI_ASM_LABEL(label_130) // Height 6: Partial accumulate: partial_2_0 + tbz x11, #1, label_131 + ldr d20, [x9], #0x8 + ldr d22, [x26], #0x8 + mov x20, #0x8 + ldr d24, [x25], #0x8 + ldr d26, [x24], #0x8 + ldr d28, [x23], #0x8 + ldr d30, [x22], #0x8 + tbz x11, #0, label_132 + ld1 { v20.s }[2], [x9] + ld1 { v22.s }[2], [x26] + ld1 { v24.s }[2], [x25] + ld1 { v26.s }[2], [x24] + ld1 { v28.s }[2], [x23] + ld1 { v30.s }[2], [x22] + b label_132 +KAI_ASM_LABEL(label_131) // Height 6: Partial accumulate: partial_1_0 + ldr s20, [x9, #0x0] + ldr s22, [x26, #0x0] + mov x20, #0x0 + ldr s24, [x25, #0x0] + ldr s26, [x24, #0x0] + ldr s28, [x23, #0x0] + ldr s30, [x22, #0x0] +KAI_ASM_LABEL(label_132) // Height 6: Partial accumulate: Done + sub x9, x9, x20 + b label_135 +KAI_ASM_LABEL(label_133) // Height 6: full accumulate + ldr q20, [x9, #0x0] + ldr q21, [x9, #0x10] + ldr q22, [x26, #0x0] + ldr q23, [x26, #0x10] + ldr q24, [x25, #0x0] + ldr q25, [x25, #0x10] + ldr q26, [x24, #0x0] + ldr q27, [x24, #0x10] + ldr q28, [x23, #0x0] + ldr q29, [x23, #0x10] + ldr q30, [x22, #0x0] + ldr q31, [x22, #0x10] + b label_135 +KAI_ASM_LABEL(label_134) // Height 6: no accumulate + movi v20.16b, #0x0 + movi v21.16b, #0x0 + movi v22.16b, #0x0 + movi v23.16b, #0x0 + movi v24.16b, #0x0 + movi v25.16b, #0x0 + movi v26.16b, #0x0 + movi v27.16b, #0x0 + movi v28.16b, #0x0 + movi v29.16b, #0x0 + movi v30.16b, #0x0 + movi v31.16b, #0x0 +KAI_ASM_LABEL(label_135) // Height 6: setup done + mov x28, #0x0 
+KAI_ASM_LABEL(label_136) // Height 6: String loop + ldr x20, [x2, #0x10] + ldr x21, [x2, #0x38] + ldr w27, [x20, x28, LSL #0x2] + tbz x3, #3, label_137 + ldr x20, [x0, x28, LSL #0x3] + add x20, x20, x21, LSL #3 + ldr x26, [x20, #0x0] + ldr x25, [x20, #0x8] + ldr x24, [x20, #0x10] + ldr x23, [x20, #0x18] + ldr x22, [x20, #0x20] + ldr x21, [x20, #0x28] + cbnz x28, label_138 + ldr x20, [x2, #0x30] + add x26, x26, x20, LSL #2 + add x25, x25, x20, LSL #2 + add x24, x24, x20, LSL #2 + add x23, x23, x20, LSL #2 + add x22, x22, x20, LSL #2 + add x21, x21, x20, LSL #2 + b label_138 +KAI_ASM_LABEL(label_137) // Height 6: setup direct input + mov x26, x0 + add x25, x26, x21, LSL #2 + add x24, x25, x21, LSL #2 + add x23, x24, x21, LSL #2 + add x22, x23, x21, LSL #2 + add x21, x22, x21, LSL #2 +KAI_ASM_LABEL(label_138) // Height 6: input setup done + cmp x27, #0x4 + blt label_141 + ldr q0, [x26, #0x0] + ldr q1, [x25, #0x0] + cmp x27, #0x8 + ldr q2, [x24, #0x0] + ldr q3, [x23, #0x0] + ldr q4, [x22, #0x0] + ldr q5, [x21, #0x0] + ldr q6, [x10, #0x0] + ldr q7, [x10, #0x10] + ldr q8, [x10, #0x20] + ldr q9, [x10, #0x30] + ldr q10, [x10, #0x40] + ldr q11, [x10, #0x50] + ldr q12, [x10, #0x60] + ldr q13, [x10, #0x70] + blt label_140 +KAI_ASM_LABEL(label_139) // Height 6: Multiply loop: Main loop head + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + sub x27, x27, #0x4 + add x26, x26, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x25, x25, #0x10 + add x24, x24, #0x10 + fmla v28.4s, v6.4s, v4.s[0] + fmla v30.4s, v6.4s, v5.s[0] + add x23, x23, #0x10 + add x22, x22, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + add x21, x21, #0x10 + cmp x27, #0x8 + fmla v25.4s, v7.4s, v2.s[0] + fmla v27.4s, v7.4s, v3.s[0] + add x10, x10, #0x80 + prfm pldl1keep, [x26, #0x80] + ldr q6, [x10, #0x0] + fmla v29.4s, v7.4s, v4.s[0] + fmla v31.4s, v7.4s, v5.s[0] + ldr q7, [x10, #0x10] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm 
pldl1keep, [x25, #0x80] + prfm pldl1keep, [x24, #0x80] + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + prfm pldl1keep, [x23, #0x80] + prfm pldl1keep, [x22, #0x80] + fmla v28.4s, v8.4s, v4.s[1] + fmla v30.4s, v8.4s, v5.s[1] + ldr q8, [x10, #0x20] + prfm pldl1keep, [x21, #0x80] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + fmla v29.4s, v9.4s, v4.s[1] + fmla v31.4s, v9.4s, v5.s[1] + ldr q9, [x10, #0x30] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + fmla v28.4s, v10.4s, v4.s[2] + fmla v30.4s, v10.4s, v5.s[2] + ldr q10, [x10, #0x40] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + fmla v29.4s, v11.4s, v4.s[2] + fmla v31.4s, v11.4s, v5.s[2] + ldr q11, [x10, #0x50] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + fmla v28.4s, v12.4s, v4.s[3] + fmla v30.4s, v12.4s, v5.s[3] + ldr q12, [x10, #0x60] + fmla v21.4s, v13.4s, v0.s[3] + ldr q0, [x26, #0x0] + fmla v23.4s, v13.4s, v1.s[3] + ldr q1, [x25, #0x0] + fmla v25.4s, v13.4s, v2.s[3] + ldr q2, [x24, #0x0] + fmla v27.4s, v13.4s, v3.s[3] + ldr q3, [x23, #0x0] + fmla v29.4s, v13.4s, v4.s[3] + ldr q4, [x22, #0x0] + fmla v31.4s, v13.4s, v5.s[3] + ldr q5, [x21, #0x0] + ldr q13, [x10, #0x70] + bge label_139 +KAI_ASM_LABEL(label_140) // Height 6: Multiply loop: Single iteration only + fmla v20.4s, v6.4s, v0.s[0] + fmla v22.4s, v6.4s, v1.s[0] + add x26, x26, #0x10 + add x25, x25, #0x10 + fmla v24.4s, v6.4s, v2.s[0] + fmla v26.4s, v6.4s, v3.s[0] + add x24, x24, #0x10 + add x23, x23, #0x10 + fmla v28.4s, v6.4s, v4.s[0] + fmla v30.4s, v6.4s, v5.s[0] + add x22, x22, #0x10 + add x21, x21, #0x10 + fmla v21.4s, v7.4s, v0.s[0] + fmla v23.4s, v7.4s, v1.s[0] + sub x27, x27, #0x4 + prfm pldl1keep, [x26, #0x80] + fmla v25.4s, 
v7.4s, v2.s[0] + fmla v27.4s, v7.4s, v3.s[0] + prfm pldl1keep, [x25, #0x80] + prfm pldl1keep, [x24, #0x80] + fmla v29.4s, v7.4s, v4.s[0] + fmla v31.4s, v7.4s, v5.s[0] + prfm pldl1keep, [x23, #0x80] + prfm pldl1keep, [x22, #0x80] + fmla v20.4s, v8.4s, v0.s[1] + fmla v22.4s, v8.4s, v1.s[1] + prfm pldl1keep, [x21, #0x80] + add x10, x10, #0x80 + fmla v24.4s, v8.4s, v2.s[1] + fmla v26.4s, v8.4s, v3.s[1] + fmla v28.4s, v8.4s, v4.s[1] + fmla v30.4s, v8.4s, v5.s[1] + fmla v21.4s, v9.4s, v0.s[1] + fmla v23.4s, v9.4s, v1.s[1] + fmla v25.4s, v9.4s, v2.s[1] + fmla v27.4s, v9.4s, v3.s[1] + fmla v29.4s, v9.4s, v4.s[1] + fmla v31.4s, v9.4s, v5.s[1] + fmla v20.4s, v10.4s, v0.s[2] + fmla v22.4s, v10.4s, v1.s[2] + fmla v24.4s, v10.4s, v2.s[2] + fmla v26.4s, v10.4s, v3.s[2] + fmla v28.4s, v10.4s, v4.s[2] + fmla v30.4s, v10.4s, v5.s[2] + fmla v21.4s, v11.4s, v0.s[2] + fmla v23.4s, v11.4s, v1.s[2] + fmla v25.4s, v11.4s, v2.s[2] + fmla v27.4s, v11.4s, v3.s[2] + fmla v29.4s, v11.4s, v4.s[2] + fmla v31.4s, v11.4s, v5.s[2] + fmla v20.4s, v12.4s, v0.s[3] + fmla v22.4s, v12.4s, v1.s[3] + fmla v24.4s, v12.4s, v2.s[3] + fmla v26.4s, v12.4s, v3.s[3] + fmla v28.4s, v12.4s, v4.s[3] + fmla v30.4s, v12.4s, v5.s[3] + fmla v21.4s, v13.4s, v0.s[3] + fmla v23.4s, v13.4s, v1.s[3] + fmla v25.4s, v13.4s, v2.s[3] + fmla v27.4s, v13.4s, v3.s[3] + fmla v29.4s, v13.4s, v4.s[3] + fmla v31.4s, v13.4s, v5.s[3] +KAI_ASM_LABEL(label_141) // Height 6: Multiply loop: Main loop skip + cbz x27, label_143 +KAI_ASM_LABEL(label_142) // Height 6: Multiply loop: Odd block loop + ldr s0, [x26], #0x4 + ldr s1, [x25], #0x4 + sub x27, x27, #0x1 + ldr s2, [x24], #0x4 + ldr s3, [x23], #0x4 + ldr s4, [x22], #0x4 + ldr s5, [x21], #0x4 + ldr q14, [x10, #0x0] + ldr q15, [x10, #0x10] + add x10, x10, #0x20 + fmla v20.4s, v14.4s, v0.s[0] + fmla v22.4s, v14.4s, v1.s[0] + fmla v24.4s, v14.4s, v2.s[0] + fmla v26.4s, v14.4s, v3.s[0] + fmla v28.4s, v14.4s, v4.s[0] + fmla v30.4s, v14.4s, v5.s[0] + fmla v21.4s, v15.4s, v0.s[0] + fmla v23.4s, 
v15.4s, v1.s[0] + fmla v25.4s, v15.4s, v2.s[0] + fmla v27.4s, v15.4s, v3.s[0] + fmla v29.4s, v15.4s, v4.s[0] + fmla v31.4s, v15.4s, v5.s[0] + cbnz x27, label_142 +KAI_ASM_LABEL(label_143) // Height 6: Multiply loop: No odd multiplies + ldr w20, [x2, #0x8] + add x28, x28, #0x1 + cmp x28, x20 + bne label_136 + ldr x20, [x2, #0x28] + prfm pstl1keep, [x9, #0x0] + add x26, x9, x20, LSL #2 + prfm pstl1keep, [x26, #0x0] + add x25, x26, x20, LSL #2 + prfm pstl1keep, [x25, #0x0] + add x24, x25, x20, LSL #2 + prfm pstl1keep, [x24, #0x0] + add x23, x24, x20, LSL #2 + add x22, x23, x20, LSL #2 + prfm pstl1keep, [x23, #0x0] + prfm pstl1keep, [x22, #0x0] + tbz x3, #1, label_144 + add x21, x2, #0x0 + add x20, x2, #0x4 + ld1r { v17.4s }, [x21] + ld1r { v16.4s }, [x20] + fmin v20.4s, v20.4s, v17.4s + fmin v21.4s, v21.4s, v17.4s + fmin v22.4s, v22.4s, v17.4s + fmin v23.4s, v23.4s, v17.4s + fmin v24.4s, v24.4s, v17.4s + fmin v25.4s, v25.4s, v17.4s + fmin v26.4s, v26.4s, v17.4s + fmin v27.4s, v27.4s, v17.4s + fmin v28.4s, v28.4s, v17.4s + fmin v29.4s, v29.4s, v17.4s + fmin v30.4s, v30.4s, v17.4s + fmin v31.4s, v31.4s, v17.4s + fmax v20.4s, v20.4s, v16.4s + fmax v21.4s, v21.4s, v16.4s + fmax v22.4s, v22.4s, v16.4s + fmax v23.4s, v23.4s, v16.4s + fmax v24.4s, v24.4s, v16.4s + fmax v25.4s, v25.4s, v16.4s + fmax v26.4s, v26.4s, v16.4s + fmax v27.4s, v27.4s, v16.4s + fmax v28.4s, v28.4s, v16.4s + fmax v29.4s, v29.4s, v16.4s + fmax v30.4s, v30.4s, v16.4s + fmax v31.4s, v31.4s, v16.4s +KAI_ASM_LABEL(label_144) // Height 6: No activation + cmp x11, #0x8 + bge label_149 + tbz x11, #2, label_146 + st1 { v20.4s }, [x9], #0x10 + st1 { v22.4s }, [x26], #0x10 + st1 { v24.4s }, [x25], #0x10 + st1 { v26.4s }, [x24], #0x10 + st1 { v28.4s }, [x23], #0x10 + st1 { v30.4s }, [x22], #0x10 + tbz x11, #1, label_145 + str d21, [x9], #0x8 + str d23, [x26], #0x8 + str d25, [x25], #0x8 + str d27, [x24], #0x8 + str d29, [x23], #0x8 + str d31, [x22], #0x8 + tbz x11, #0, label_148 + st1 { v21.s }[2], [x9] + st1 { 
v23.s }[2], [x26] + st1 { v25.s }[2], [x25] + st1 { v27.s }[2], [x24] + st1 { v29.s }[2], [x23] + st1 { v31.s }[2], [x22] + b label_148 +KAI_ASM_LABEL(label_145) // Height 6: Partial direct writeback: partial_1_4 + tbz x11, #0, label_148 + str s21, [x9, #0x0] + str s23, [x26, #0x0] + str s25, [x25, #0x0] + str s27, [x24, #0x0] + str s29, [x23, #0x0] + str s31, [x22, #0x0] + b label_148 +KAI_ASM_LABEL(label_146) // Height 6: Partial direct writeback: partial_2_0 + tbz x11, #1, label_147 + str d20, [x9], #0x8 + str d22, [x26], #0x8 + str d24, [x25], #0x8 + str d26, [x24], #0x8 + str d28, [x23], #0x8 + str d30, [x22], #0x8 + tbz x11, #0, label_148 + st1 { v20.s }[2], [x9] + st1 { v22.s }[2], [x26] + st1 { v24.s }[2], [x25] + st1 { v26.s }[2], [x24] + st1 { v28.s }[2], [x23] + st1 { v30.s }[2], [x22] + b label_148 +KAI_ASM_LABEL(label_147) // Height 6: Partial direct writeback: partial_1_0 + str s20, [x9, #0x0] + str s22, [x26, #0x0] + str s24, [x25, #0x0] + str s26, [x24, #0x0] + str s28, [x23, #0x0] + str s30, [x22, #0x0] +KAI_ASM_LABEL(label_148) // Height 6: Partial direct writeback: Done + b label_150 +KAI_ASM_LABEL(label_149) // Height 6: Full writeback + str q20, [x9, #0x0] + str q21, [x9, #0x10] + add x9, x9, #0x20 + str q22, [x26, #0x0] + str q23, [x26, #0x10] + str q24, [x25, #0x0] + str q25, [x25, #0x10] + str q26, [x24, #0x0] + str q27, [x24, #0x10] + str q28, [x23, #0x0] + str q29, [x23, #0x10] + str q30, [x22, #0x0] + str q31, [x22, #0x10] +KAI_ASM_LABEL(label_150) // Height 6: Writeback done + subs x11, x11, #0x8 + bgt label_127 + subs x1, x1, #0x6 + beq label_152 + ldr x21, [x2, #0x38] + tbz x3, #3, label_151 + add x21, x21, #0x6 + str x21, [x2, #0x38] + b label_1 +KAI_ASM_LABEL(label_151) // Update direct input + mov x20, #0x18 + madd x0, x20, x21, x0 + b label_1 +KAI_ASM_LABEL(label_152) // Exit + ldp x22, x23, [sp, 16] + ldp x24, x25, [sp, 32] + ldp x26, x27, [sp, 48] + ldr x28, [sp, 64] + ldp d10, d11, [sp, 72] + ldp d12, d13, [sp, 88] + ldp d14, 
d15, [sp, 104] + ldp d8, d9, [sp, 120] + ldp x20, x21, [sp], 144 + ret + + KAI_ASM_FOOTER