From 062a41a0d8be74ba2585adfe419cb84809823b22 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Wed, 9 Jul 2025 19:43:42 +0100 Subject: [PATCH 1/3] Update kai_kernel_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa * Update the asm kernel to multiply the zero-points and sum as integers instead of float. Signed-off-by: Anitha Raj --- ...1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S index 68891f76..d7083c93 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S @@ -94,7 +94,6 @@ KAI_ASM_LABEL(label_3) // Block Loop KAI_ASM_INST(0xa041c564) // ld1w { z4.s - z7.s }, pn9/z, [x11, #0x4, mul vl] KAI_ASM_INST(0xa042c568) // ld1w { z8.s - z11.s }, pn9/z, [x11, #0x8, mul vl] addvl x11, x11, #0xc - KAI_ASM_INST(0xc132e000) // scvtf { z0.s - z3.s }, { z0.s - z3.s } mov x14, #0x0 // =0 addvl x15, x10, #0x1 KAI_ASM_LABEL(label_4) @@ -102,25 +101,20 @@ KAI_ASM_LABEL(label_4) ld1rw { z17.s }, p2/z, [x15] add x10, x10, #0x4 add x15, x15, #0x4 - scvtf z16.s, p2/m, z16.s - fmul z24.s, z16.s, z0.s - fmul z25.s, z16.s, z1.s - fmul z26.s, z16.s, z2.s - fmul z27.s, z16.s, z3.s + KAI_ASM_INST(0xc006440c) // mov { z12.b - z15.b }, za0h.b[w14, 0x0:0x3] + mla z12.s, p2/m, z16.s, z0.s + mla z13.s, p2/m, z16.s, z1.s + mla z14.s, p2/m, z16.s, z2.s + mla z15.s, p2/m, z16.s, z3.s + KAI_ASM_INST(0xc132e18c) // scvtf { z12.s - z15.s }, { z12.s - z15.s } fmul z20.s, z17.s, z4.s fmul z21.s, z17.s, z5.s fmul z22.s, z17.s, z6.s fmul z23.s, z17.s, z7.s - 
fmul z24.s, z24.s, z20.s - fmul z25.s, z25.s, z21.s - fmul z26.s, z26.s, z22.s - fmul z27.s, z27.s, z23.s - KAI_ASM_INST(0xc006440c) // mov { z12.b - z15.b }, za0h.b[w14, 0x0:0x3] - KAI_ASM_INST(0xc132e18c) // scvtf { z12.s - z15.s }, { z12.s - z15.s } - fmla z24.s, p2/m, z20.s, z12.s - fmla z25.s, p2/m, z21.s, z13.s - fmla z26.s, p2/m, z22.s, z14.s - fmla z27.s, p2/m, z23.s, z15.s + fmul z24.s, z20.s, z12.s + fmul z25.s, z21.s, z13.s + fmul z26.s, z22.s, z14.s + fmul z27.s, z23.s, z15.s fadd z24.s, p2/m, z24.s, z8.s fadd z25.s, p2/m, z25.s, z9.s fadd z26.s, p2/m, z26.s, z10.s -- GitLab From a08a5f76e93d61af83fb4bd16d94bf67df30ade5 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Fri, 11 Jul 2025 11:43:52 +0100 Subject: [PATCH 2/3] Update Changelog Signed-off-by: Anitha Raj --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9d1e63e..12721691 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - New SME micro-kernels: - Matrix multiplication (1xN) of F32 LHS and RHS with F32 output, using instructions compatible with FEAT_SME. - Matrix multiplication (1xN) of F16 LHS and RHS with F16 output, using instructions compatible with FEAT_SME. 
+- Fixes: + - Update kai_kernel_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa to improve accuracy. ## v1.11.0 -- GitLab From 387eaccc5f8aa13e2894f5a8062a1ed2e65d5cdf Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Mon, 14 Jul 2025 13:04:39 +0100 Subject: [PATCH 3/3] Shuffle the instructions to hide latency Signed-off-by: Anitha Raj --- ..._f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S index d7083c93..1c2fb40a 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S @@ -102,15 +102,15 @@ KAI_ASM_LABEL(label_4) add x10, x10, #0x4 add x15, x15, #0x4 KAI_ASM_INST(0xc006440c) // mov { z12.b - z15.b }, za0h.b[w14, 0x0:0x3] + fmul z20.s, z17.s, z4.s + fmul z21.s, z17.s, z5.s + fmul z22.s, z17.s, z6.s + fmul z23.s, z17.s, z7.s mla z12.s, p2/m, z16.s, z0.s mla z13.s, p2/m, z16.s, z1.s mla z14.s, p2/m, z16.s, z2.s mla z15.s, p2/m, z16.s, z3.s KAI_ASM_INST(0xc132e18c) // scvtf { z12.s - z15.s }, { z12.s - z15.s } - fmul z20.s, z17.s, z4.s - fmul z21.s, z17.s, z5.s - fmul z22.s, z17.s, z6.s - fmul z23.s, z17.s, z7.s fmul z24.s, z20.s, z12.s fmul z25.s, z21.s, z13.s fmul z26.s, z22.s, z14.s -- GitLab