From d21426e15a2b46d18924c8e398eaffe4c6dce8e8 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Fri, 17 Jan 2025 16:48:59 +0100 Subject: [PATCH 1/8] Add asm file for casting between f16 and f32 to core library Signed-off-by: Jens Elofsson --- CMakeLists.txt | 6 +++++ kai/common/assembly.h | 47 +++++++++++++++++++++++++++++++++ kai/common/float16_asm_common.S | 43 ++++++++++++++++++++++++++++++ kai/kai_common.h | 19 ++++++++++--- 4 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 kai/common/assembly.h create mode 100644 kai/common/float16_asm_common.S diff --git a/CMakeLists.txt b/CMakeLists.txt index 19675fbf..77c74153 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,6 +234,12 @@ target_compile_options(kleidiai PRIVATE ${KLEIDIAI_WARNING_FLAGS} ) +if(MSVC) + set_source_files_properties(kai/common/float16_asm_common.S PROPERTIES LANGUAGE ASM_MARMASM) + target_sources(kleidiai PUBLIC kai/common/float16_asm_common.S) +endif() + + if(KLEIDIAI_BUILD_TESTS) include(FetchGTest) enable_testing() diff --git a/kai/common/assembly.h b/kai/common/assembly.h new file mode 100644 index 00000000..e935083f --- /dev/null +++ b/kai/common/assembly.h @@ -0,0 +1,47 @@ +// +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef KAI_COMMON_ASSEMBLY_H +#define KAI_COMMON_ASSEMBLY_H + +// clang-format off + +#ifdef _MSC_VER + +#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 +#define KAI_ASM_LABEL(label) |label| +#define KAI_ASM_TARGET(label, direction) |label| +#define KAI_ASM_FUNCTION(label) |label| +#define KAI_ASM_EXPORT(label) global label +#define KAI_ASM_FOOTER end +#define KAI_ASM_INST(num) dcd num + +#else // _MSC_VER + +#define KAI_ASM_HEADER .text +#define KAI_ASM_LABEL(label) label: +#define KAI_ASM_TARGET(label, direction) label##direction + +#ifdef __APPLE__ +#define KAI_ASM_FUNCTION(label) _##label: +#define KAI_ASM_EXPORT(label) \ + .global _##label; 
\ + .type _##label, %function +#else // __APPLE__ +#define KAI_ASM_FUNCTION(label) label: +#define KAI_ASM_EXPORT(label) \ + .global label; \ + .type label, %function +#endif // __APPLE__ + +#define KAI_ASM_FOOTER +#define KAI_ASM_INST(num) .inst num + +#endif // _MSC_VER + +// clang-format on + +#endif // KAI_COMMON_ASSEMBLY_H diff --git a/kai/common/float16_asm_common.S b/kai/common/float16_asm_common.S new file mode 100644 index 00000000..bbf508b7 --- /dev/null +++ b/kai/common/float16_asm_common.S @@ -0,0 +1,43 @@ +; +; SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +; +; SPDX-License-Identifier: Apache-2.0 +; +#include "kai/common/assembly.h" + +#if defined(_MSC_VER) +# define KAI_ASM_CODE(name) AREA name, CODE, READONLY +# define KAI_ASM_LABEL(label) label +# define KAI_ASM_LABEL_GLOBAL(label) label +# define KAI_ASM_GLOBAL(symbol) global symbol +# define KAI_ASM_END end +#elif defined(__APPLE__) +# define KAI_ASM_CODE(name) .text +# define KAI_ASM_LABEL(label) _##label: +# define KAI_ASM_LABEL_GLOBAL(label) _##label: +# define KAI_ASM_GLOBAL(symbol) .global _##symbol +# define KAI_ASM_END +#else +# define KAI_ASM_CODE(name) .text +# define KAI_ASM_LABEL(label) label: +# define KAI_ASM_LABEL_GLOBAL(label) label: +# define KAI_ASM_GLOBAL(symbol) .global symbol +# define KAI_ASM_END +#endif + + KAI_ASM_CODE(kai_common_float16) + + KAI_ASM_GLOBAL(kai_common_float16_from_float) + KAI_ASM_GLOBAL(kai_common_float_from_float16) + +KAI_ASM_LABEL_GLOBAL(kai_common_float16_from_float) + fcvt h0, s0 + fmov w0, h0 + ret + +KAI_ASM_LABEL_GLOBAL(kai_common_float_from_float16) + fmov h0, w0 + fcvt s0, h0 + ret + + KAI_ASM_END diff --git a/kai/kai_common.h b/kai/kai_common.h index 79b08978..a359787b 100644 --- a/kai/kai_common.h +++ b/kai/kai_common.h @@ -5,7 +5,7 @@ // #pragma once -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || (defined(_M_ARM64) && defined(_WIN64)) #include #endif // defined(__ARM_NEON) @@ -85,14 +85,23 @@ inline static 
size_t kai_get_datatype_size_in_bytes(enum kai_datatype dt) { return (size_t)(dt >> 8); } +#if defined(_M_ARM64) && defined(_WIN64) +uint16_t kai_common_float16_from_float(float value); +float kai_common_float_from_float16(uint16_t value); +#endif + /// Converts a scalar f16 value to f32 /// @param[in] f16 The f16 value /// /// @return the f32 value -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || (defined(_M_ARM64) && defined(_WIN64)) inline static float kai_cast_f32_f16(uint16_t f16) { +#if defined(_M_ARM64) && defined(_WIN64) + float f32 = kai_common_float_from_float16(f16); +#else float16_t f32 = 0; memcpy(&f32, &f16, sizeof(uint16_t)); +#endif return (float)f32; } #endif @@ -127,11 +136,15 @@ inline static uint16_t kai_cast_bf16_f32(float f32) { /// @param[in] f32 The f32 value /// /// @return the f16 value -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(_M_ARM64) inline static uint16_t kai_cast_f16_f32(float f32) { uint16_t f16 = 0; +#if defined(_M_ARM64) && defined(_WIN64) + f16 = kai_common_float16_from_float(f32); +#else float16_t tmp = (float16_t)f32; memcpy(&f16, &tmp, sizeof(uint16_t)); +#endif return f16; } #endif -- GitLab From 7174a4cbed6b0efcdd99b1cb62f67aea68d3dec4 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Tue, 21 Jan 2025 10:59:43 +0100 Subject: [PATCH 2/8] Address review comments. Move the contents of assembly.h into the .S-file. 
Signed-off-by: Jens Elofsson --- kai/common/assembly.h | 47 --------------------------------- kai/common/float16_asm_common.S | 34 +++++++++++++++++++++++- 2 files changed, 33 insertions(+), 48 deletions(-) delete mode 100644 kai/common/assembly.h diff --git a/kai/common/assembly.h b/kai/common/assembly.h deleted file mode 100644 index e935083f..00000000 --- a/kai/common/assembly.h +++ /dev/null @@ -1,47 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#ifndef KAI_COMMON_ASSEMBLY_H -#define KAI_COMMON_ASSEMBLY_H - -// clang-format off - -#ifdef _MSC_VER - -#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 -#define KAI_ASM_LABEL(label) |label| -#define KAI_ASM_TARGET(label, direction) |label| -#define KAI_ASM_FUNCTION(label) |label| -#define KAI_ASM_EXPORT(label) global label -#define KAI_ASM_FOOTER end -#define KAI_ASM_INST(num) dcd num - -#else // _MSC_VER - -#define KAI_ASM_HEADER .text -#define KAI_ASM_LABEL(label) label: -#define KAI_ASM_TARGET(label, direction) label##direction - -#ifdef __APPLE__ -#define KAI_ASM_FUNCTION(label) _##label: -#define KAI_ASM_EXPORT(label) \ - .global _##label; \ - .type _##label, %function -#else // __APPLE__ -#define KAI_ASM_FUNCTION(label) label: -#define KAI_ASM_EXPORT(label) \ - .global label; \ - .type label, %function -#endif // __APPLE__ - -#define KAI_ASM_FOOTER -#define KAI_ASM_INST(num) .inst num - -#endif // _MSC_VER - -// clang-format on - -#endif // KAI_COMMON_ASSEMBLY_H diff --git a/kai/common/float16_asm_common.S b/kai/common/float16_asm_common.S index bbf508b7..de4b3909 100644 --- a/kai/common/float16_asm_common.S +++ b/kai/common/float16_asm_common.S @@ -3,7 +3,39 @@ ; ; SPDX-License-Identifier: Apache-2.0 ; -#include "kai/common/assembly.h" + +#ifdef _MSC_VER + +#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 +#define KAI_ASM_LABEL(label) |label| +#define KAI_ASM_TARGET(label, direction) 
|label| +#define KAI_ASM_FUNCTION(label) |label| +#define KAI_ASM_EXPORT(label) global label +#define KAI_ASM_FOOTER end +#define KAI_ASM_INST(num) dcd num + +#else // _MSC_VER + +#define KAI_ASM_HEADER .text +#define KAI_ASM_LABEL(label) label: +#define KAI_ASM_TARGET(label, direction) label##direction + +#ifdef __APPLE__ +#define KAI_ASM_FUNCTION(label) _##label: +#define KAI_ASM_EXPORT(label) \ + .global _##label; \ + .type _##label, %function +#else // __APPLE__ +#define KAI_ASM_FUNCTION(label) label: +#define KAI_ASM_EXPORT(label) \ + .global label; \ + .type label, %function +#endif // __APPLE__ + +#define KAI_ASM_FOOTER +#define KAI_ASM_INST(num) .inst num + +#endif // _MSC_VER #if defined(_MSC_VER) # define KAI_ASM_CODE(name) AREA name, CODE, READONLY -- GitLab From f3f6a15f380f2f0661ece1379b76dff53d8265c2 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Wed, 22 Jan 2025 11:04:02 +0100 Subject: [PATCH 3/8] Address review comments Change filename of assembly file. Signed-off-by: Jens Elofsson --- CMakeLists.txt | 4 ++-- kai/common/{float16_asm_common.S => kai_asm_typecast.S} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename kai/common/{float16_asm_common.S => kai_asm_typecast.S} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77c74153..b5e35232 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,8 +235,8 @@ target_compile_options(kleidiai ) if(MSVC) - set_source_files_properties(kai/common/float16_asm_common.S PROPERTIES LANGUAGE ASM_MARMASM) - target_sources(kleidiai PUBLIC kai/common/float16_asm_common.S) + set_source_files_properties(kai/common/kai_asm_typecast.S PROPERTIES LANGUAGE ASM_MARMASM) + target_sources(kleidiai PUBLIC kai/common/kai_asm_typecast.S) endif() diff --git a/kai/common/float16_asm_common.S b/kai/common/kai_asm_typecast.S similarity index 100% rename from kai/common/float16_asm_common.S rename to kai/common/kai_asm_typecast.S -- GitLab From 3fc07e55ec32f6c038d6d9a440c137c232a593b2 Mon Sep 17 00:00:00 2001 
From: Jens Elofsson Date: Thu, 23 Jan 2025 13:52:48 +0100 Subject: [PATCH 4/8] Address review comments. - Move the asm typecasting implementation to individual .S-files for each of the kernels and remove the original .S-file - Remove the asm that was added to kai_common.h and its related changes to CMakeLists.txt Signed-off-by: Jens Elofsson --- CMakeLists.txt | 8 +-- kai/kai_common.h | 19 +---- .../kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c | 5 +- ..._rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S} | 24 +++---- .../kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c | 5 +- ...i_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S | 69 +++++++++++++++++++ 6 files changed, 91 insertions(+), 39 deletions(-) rename kai/{common/kai_asm_typecast.S => ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S} (77%) create mode 100644 kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S diff --git a/CMakeLists.txt b/CMakeLists.txt index b5e35232..5f426c31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,10 +88,12 @@ set(KLEIDIAI_FILES_SCALAR kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c + kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c + kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S ) set(KLEIDIAI_FILES_NEON_FP16 @@ -234,12 +236,6 @@ target_compile_options(kleidiai PRIVATE ${KLEIDIAI_WARNING_FLAGS} ) -if(MSVC) - set_source_files_properties(kai/common/kai_asm_typecast.S PROPERTIES LANGUAGE ASM_MARMASM) - target_sources(kleidiai PUBLIC kai/common/kai_asm_typecast.S) -endif() - - if(KLEIDIAI_BUILD_TESTS) include(FetchGTest) 
enable_testing() diff --git a/kai/kai_common.h b/kai/kai_common.h index a359787b..79b08978 100644 --- a/kai/kai_common.h +++ b/kai/kai_common.h @@ -5,7 +5,7 @@ // #pragma once -#if defined(__ARM_NEON) || (defined(_M_ARM64) && defined(_WIN64)) +#if defined(__ARM_NEON) #include #endif // defined(__ARM_NEON) @@ -85,23 +85,14 @@ inline static size_t kai_get_datatype_size_in_bytes(enum kai_datatype dt) { return (size_t)(dt >> 8); } -#if defined(_M_ARM64) && defined(_WIN64) -uint16_t kai_common_float16_from_float(float value); -float kai_common_float_from_float16(uint16_t value); -#endif - /// Converts a scalar f16 value to f32 /// @param[in] f16 The f16 value /// /// @return the f32 value -#if defined(__ARM_NEON) || (defined(_M_ARM64) && defined(_WIN64)) +#if defined(__ARM_NEON) inline static float kai_cast_f32_f16(uint16_t f16) { -#if defined(_M_ARM64) && defined(_WIN64) - float f32 = kai_common_float_from_float16(f16); -#else float16_t f32 = 0; memcpy(&f32, &f16, sizeof(uint16_t)); -#endif return (float)f32; } #endif @@ -136,15 +127,11 @@ inline static uint16_t kai_cast_bf16_f32(float f32) { /// @param[in] f32 The f32 value /// /// @return the f16 value -#if defined(__ARM_NEON) || defined(_M_ARM64) +#if defined(__ARM_NEON) inline static uint16_t kai_cast_f16_f32(float f32) { uint16_t f16 = 0; -#if defined(_M_ARM64) && defined(_WIN64) - f16 = kai_common_float16_from_float(f32); -#else float16_t tmp = (float16_t)f32; memcpy(&f16, &tmp, sizeof(uint16_t)); -#endif return f16; } #endif diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c index 608de410..92c34e9a 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c @@ -11,6 +11,8 @@ #include "kai/kai_common.h" +float kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_float_from_float16(uint16_t value); + static const size_t kai_num_bytes_sum_rhs = 
sizeof(float); static const size_t kai_num_bytes_bias = sizeof(float); static const size_t kai_nr_multiple_of = 4; @@ -204,7 +206,8 @@ void kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( d = ((float*)rhs_packed_scale)[nr_idx]; break; case kai_dt_f16: - d = kai_cast_f32_f16(((uint16_t*)rhs_packed_scale)[nr_idx]); + d = kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_float_from_float16( + ((uint16_t*)rhs_packed_scale)[nr_idx]); break; case kai_dt_bf16: d = kai_cast_f32_bf16(((uint16_t*)rhs_packed_scale)[nr_idx]); diff --git a/kai/common/kai_asm_typecast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S similarity index 77% rename from kai/common/kai_asm_typecast.S rename to kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S index de4b3909..83be1d11 100644 --- a/kai/common/kai_asm_typecast.S +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S @@ -1,8 +1,8 @@ -; -; SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates -; -; SPDX-License-Identifier: Apache-2.0 -; +// +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// #ifdef _MSC_VER @@ -57,19 +57,13 @@ # define KAI_ASM_END #endif - KAI_ASM_CODE(kai_common_float16) + KAI_ASM_CODE(kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_float16) - KAI_ASM_GLOBAL(kai_common_float16_from_float) - KAI_ASM_GLOBAL(kai_common_float_from_float16) + KAI_ASM_GLOBAL(kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_float_from_float16) -KAI_ASM_LABEL_GLOBAL(kai_common_float16_from_float) +KAI_ASM_LABEL_GLOBAL(kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_float_from_float16) - fcvt h0, s0 - fmov w0, h0 - ret - -KAI_ASM_LABEL_GLOBAL(kai_common_float_from_float16) - fmov h0, w0 + fmov s0, w0 fcvt s0, h0 ret KAI_ASM_END diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c index 5a86b8a0..ae59f615 100644 --- 
a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c @@ -11,6 +11,8 @@ #include "kai/kai_common.h" +float kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_float_from_float16(uint16_t value); + static const size_t kai_num_bytes_sum_rhs = sizeof(float); static const size_t kai_num_bytes_bias = sizeof(float); static const size_t kai_nr_multiple_of = 4; @@ -197,7 +199,8 @@ void kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( d = ((float*)rhs_packed_scale)[nr_idx]; break; case kai_dt_f16: - d = kai_cast_f32_f16(((uint16_t*)rhs_packed_scale)[nr_idx]); + d = kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_float_from_float16( + ((uint16_t*)rhs_packed_scale)[nr_idx]); break; case kai_dt_bf16: d = kai_cast_f32_bf16(((uint16_t*)rhs_packed_scale)[nr_idx]); diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S new file mode 100644 index 00000000..d6b4c5b4 --- /dev/null +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S @@ -0,0 +1,69 @@ +// +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef _MSC_VER + +#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 +#define KAI_ASM_LABEL(label) |label| +#define KAI_ASM_TARGET(label, direction) |label| +#define KAI_ASM_FUNCTION(label) |label| +#define KAI_ASM_EXPORT(label) global label +#define KAI_ASM_FOOTER end +#define KAI_ASM_INST(num) dcd num + +#else // _MSC_VER + +#define KAI_ASM_HEADER .text +#define KAI_ASM_LABEL(label) label: +#define KAI_ASM_TARGET(label, direction) label##direction + +#ifdef __APPLE__ +#define KAI_ASM_FUNCTION(label) _##label: +#define KAI_ASM_EXPORT(label) \ + .global _##label; \ + .type _##label, %function +#else // __APPLE__ +#define KAI_ASM_FUNCTION(label) label: +#define KAI_ASM_EXPORT(label) \ + .global label; \ + .type label, 
%function +#endif // __APPLE__ + +#define KAI_ASM_FOOTER +#define KAI_ASM_INST(num) .inst num + +#endif // _MSC_VER + +#if defined(_MSC_VER) +# define KAI_ASM_CODE(name) AREA name, CODE, READONLY +# define KAI_ASM_LABEL(label) label +# define KAI_ASM_LABEL_GLOBAL(label) label +# define KAI_ASM_GLOBAL(symbol) global symbol +# define KAI_ASM_END end +#elif defined(__APPLE__) +# define KAI_ASM_CODE(name) .text +# define KAI_ASM_LABEL(label) _##label: +# define KAI_ASM_LABEL_GLOBAL(label) _##label: +# define KAI_ASM_GLOBAL(symbol) .global _##symbol +# define KAI_ASM_END +#else +# define KAI_ASM_CODE(name) .text +# define KAI_ASM_LABEL(label) label: +# define KAI_ASM_LABEL_GLOBAL(label) label: +# define KAI_ASM_GLOBAL(symbol) .global symbol +# define KAI_ASM_END +#endif + + KAI_ASM_CODE(kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_float16) + + KAI_ASM_GLOBAL(kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_float_from_float16) + +KAI_ASM_LABEL_GLOBAL(kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_float_from_float16) + fmov s0, w0 + fcvt s0, h0 + ret + + KAI_ASM_END -- GitLab From 0cd07a01c0b2030ea1f572ee7031ef9c2a9cf2bf Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Thu, 23 Jan 2025 15:19:38 +0100 Subject: [PATCH 5/8] Address review comments - Add CHANGELOG.md entry Signed-off-by: Jens Elofsson --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f50c025a..d2138788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - Remove `-Werror` from default build flags as to not cause integration problems - Expose the rhs_packed_stride in the header file - Fix validation error when n > nr in kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa +- Add MSVC support for `kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0` and `kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0` packing kernels. 
## v1.2.0 -- GitLab From 7fa27b84b1b79fb20237460f1ba13a8444b1f50d Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Fri, 24 Jan 2025 09:55:49 +0100 Subject: [PATCH 6/8] Various changes to fix CI pipeline build errors - Add rhs packing assembly files to matmul_clamp_f32_qai8dxp_qsi4c32p example - Add rhs packing assembly files to Bazel - Remove duplicate definitions of KAI_ASM_* defines Signed-off-by: Jens Elofsson --- .../CMakeLists.txt | 4 +++ kai/ukernels/matmul/BUILD.bazel | 12 +++++++ ...i_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S | 33 ------------------- ...i_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S | 33 ------------------- 4 files changed, 16 insertions(+), 66 deletions(-) diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt index 28bbd67c..9fbf628c 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt @@ -14,6 +14,8 @@ set(KLEIDIAI_PATH ../../) set(MATMUL_PACK_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/) set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/) +enable_language(ASM) + # KleidiAI include directories include_directories( ${KLEIDIAI_PATH} @@ -25,7 +27,9 @@ add_executable(matmul_clamp_f32_qai8dxp_qsi4c32p matmul_clamp_f32_qai8dxp_qsi4c32p.cpp ${KLEIDIAI_PATH}/kai/kai_common.h ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c + ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c + ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qai8dxp_f32.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_asm.S diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 97f12218..2bd0fbd7 100644 --- 
a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -44,6 +44,11 @@ NEON_KERNELS_ASM = [ "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla", ] +NEON_KERNELS_CAST_ASM = [ + "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0", + "pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0", +] + # buildifier: keep sorted FP16_KERNELS = [ "matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla", @@ -168,6 +173,12 @@ kai_c_library( cpu_uarch = kai_cpu_neon(), ) +kai_c_library( + name = "neon_impl_cast_asm", + srcs = [ukernel + "_cast.S" for ukernel in NEON_KERNELS_CAST_ASM], + cpu_uarch = kai_cpu_neon(), +) + kai_c_library( name = "fp16_impl", srcs = [ukernel + ".c" for ukernel in FP16_KERNELS], @@ -243,6 +254,7 @@ kai_c_library( ":interface", ":neon_impl", ":neon_impl_asm", + ":neon_impl_cast_asm", ":scalar_impl", ":sme2_impl", ":sme_impl", diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S index 83be1d11..06a10a04 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S @@ -4,39 +4,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef _MSC_VER - -#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 -#define KAI_ASM_LABEL(label) |label| -#define KAI_ASM_TARGET(label, direction) |label| -#define KAI_ASM_FUNCTION(label) |label| -#define KAI_ASM_EXPORT(label) global label -#define KAI_ASM_FOOTER end -#define KAI_ASM_INST(num) dcd num - -#else // _MSC_VER - -#define KAI_ASM_HEADER .text -#define KAI_ASM_LABEL(label) label: -#define KAI_ASM_TARGET(label, direction) label##direction - -#ifdef __APPLE__ -#define KAI_ASM_FUNCTION(label) _##label: -#define KAI_ASM_EXPORT(label) \ - .global _##label; \ - .type _##label, %function -#else // __APPLE__ -#define KAI_ASM_FUNCTION(label) label: -#define 
KAI_ASM_EXPORT(label) \ - .global label; \ - .type label, %function -#endif // __APPLE__ - -#define KAI_ASM_FOOTER -#define KAI_ASM_INST(num) .inst num - -#endif // _MSC_VER - #if defined(_MSC_VER) # define KAI_ASM_CODE(name) AREA name, CODE, READONLY # define KAI_ASM_LABEL(label) label diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S index d6b4c5b4..dd4b68c1 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S @@ -4,39 +4,6 @@ // SPDX-License-Identifier: Apache-2.0 // -#ifdef _MSC_VER - -#define KAI_ASM_HEADER AREA |.text|, CODE, READONLY, ALIGN=4 -#define KAI_ASM_LABEL(label) |label| -#define KAI_ASM_TARGET(label, direction) |label| -#define KAI_ASM_FUNCTION(label) |label| -#define KAI_ASM_EXPORT(label) global label -#define KAI_ASM_FOOTER end -#define KAI_ASM_INST(num) dcd num - -#else // _MSC_VER - -#define KAI_ASM_HEADER .text -#define KAI_ASM_LABEL(label) label: -#define KAI_ASM_TARGET(label, direction) label##direction - -#ifdef __APPLE__ -#define KAI_ASM_FUNCTION(label) _##label: -#define KAI_ASM_EXPORT(label) \ - .global _##label; \ - .type _##label, %function -#else // __APPLE__ -#define KAI_ASM_FUNCTION(label) label: -#define KAI_ASM_EXPORT(label) \ - .global label; \ - .type label, %function -#endif // __APPLE__ - -#define KAI_ASM_FOOTER -#define KAI_ASM_INST(num) .inst num - -#endif // _MSC_VER - #if defined(_MSC_VER) # define KAI_ASM_CODE(name) AREA name, CODE, READONLY # define KAI_ASM_LABEL(label) label -- GitLab From 45b28ab94ed5e1fdf602f981a2e5f8f3b3c24dbd Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Fri, 24 Jan 2025 15:39:57 +0100 Subject: [PATCH 7/8] Address review comments - Rename assembly files to end in _asm.S Signed-off-by: Jens Elofsson --- CMakeLists.txt | 4 ++-- examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt | 4 
++-- kai/ukernels/matmul/BUILD.bazel | 2 +- ...ast.S => kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S} | 0 ...ast.S => kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S} | 0 5 files changed, 5 insertions(+), 5 deletions(-) rename kai/ukernels/matmul/pack/{kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S => kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S} (100%) rename kai/ukernels/matmul/pack/{kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S => kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f426c31..3fd05182 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,12 +88,12 @@ set(KLEIDIAI_FILES_SCALAR kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c - kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S + kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c - kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S + kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S ) set(KLEIDIAI_FILES_NEON_FP16 diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt index 9fbf628c..bec71b04 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4c32p/CMakeLists.txt @@ -27,9 +27,9 @@ add_executable(matmul_clamp_f32_qai8dxp_qsi4c32p matmul_clamp_f32_qai8dxp_qsi4c32p.cpp ${KLEIDIAI_PATH}/kai/kai_common.h ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.c - ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S + 
${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.c - ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S + ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qai8dxp_f32.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c ${MATMUL_PATH}/kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod_asm.S diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 2bd0fbd7..41de55ac 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -175,7 +175,7 @@ kai_c_library( kai_c_library( name = "neon_impl_cast_asm", - srcs = [ukernel + "_cast.S" for ukernel in NEON_KERNELS_CAST_ASM], + srcs = [ukernel + "_cast_asm.S" for ukernel in NEON_KERNELS_CAST_ASM], cpu_uarch = kai_cpu_neon(), ) diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S similarity index 100% rename from kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast.S rename to kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast_asm.S diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S similarity index 100% rename from kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast.S rename to kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast_asm.S -- GitLab From 5cfc860daa73d0f34be17e61a41123a133c1dd29 Mon Sep 17 00:00:00 2001 From: Jens Elofsson Date: Mon, 27 Jan 2025 13:52:19 +0100 Subject: [PATCH 8/8] Address review comments - Move kernels into existing NEON_KERNELS_ASM list in Bazel build file. - More detailed changelog entry. 
Signed-off-by: Jens Elofsson --- CHANGELOG.md | 1 + kai/ukernels/matmul/BUILD.bazel | 14 ++------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2138788..592d1389 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ KleidiAI follows the [Semantic Versioning](https://semver.org/) specification fo - Expose the rhs_packed_stride in the header file - Fix validation error when n > nr in kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa - Add MSVC support for `kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0` and `kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0` packing kernels. + - Add assembler implementation of f16 to f32 typecasting to avoid use of float16_t. ## v1.2.0 diff --git a/kai/ukernels/matmul/BUILD.bazel b/kai/ukernels/matmul/BUILD.bazel index 41de55ac..d3999867 100644 --- a/kai/ukernels/matmul/BUILD.bazel +++ b/kai/ukernels/matmul/BUILD.bazel @@ -42,11 +42,8 @@ NEON_KERNELS = [ NEON_KERNELS_ASM = [ "matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla", -] - -NEON_KERNELS_CAST_ASM = [ - "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0", - "pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0", + "pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_cast", + "pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_cast", ] # buildifier: keep sorted @@ -173,12 +170,6 @@ kai_c_library( cpu_uarch = kai_cpu_neon(), ) -kai_c_library( - name = "neon_impl_cast_asm", - srcs = [ukernel + "_cast_asm.S" for ukernel in NEON_KERNELS_CAST_ASM], - cpu_uarch = kai_cpu_neon(), -) - kai_c_library( name = "fp16_impl", srcs = [ukernel + ".c" for ukernel in FP16_KERNELS], @@ -254,7 +245,6 @@ kai_c_library( ":interface", ":neon_impl", ":neon_impl_asm", - ":neon_impl_cast_asm", ":scalar_impl", ":sme2_impl", ":sme_impl", -- GitLab