From fd5fc348167c276e2ae1bbb76391ec94c31833c4 Mon Sep 17 00:00:00 2001 From: Matthew Bentham Date: Fri, 11 Apr 2025 08:50:39 +0000 Subject: [PATCH 1/4] Remove use of bfcvt instruction from matmul_clamp_f32_qai8dxp_qsi4c32p tests This allows tests for those kernels to run when the CPU running the test does not have BFloat16 support. Use of bfcvt is replaced by simple truncation - this will effectively round towards 0 Signed-off-by: Matthew Bentham --- CMakeLists.txt | 1 - test/common/bfloat16.cpp | 10 ++++++- test/common/bfloat16.hpp | 13 ++------- test/common/bfloat16_asm.S | 18 ------------- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 27 ++++++++----------- 5 files changed, 22 insertions(+), 47 deletions(-) delete mode 100644 test/common/bfloat16_asm.S diff --git a/CMakeLists.txt b/CMakeLists.txt index ad8f5580..22d650f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,7 +330,6 @@ if(KLEIDIAI_BUILD_TESTS) add_library(kleidiai_test_framework test/common/bfloat16.cpp - test/common/bfloat16_asm.S test/common/compare.cpp test/common/cpu_info.cpp test/common/data_format.cpp diff --git a/test/common/bfloat16.cpp b/test/common/bfloat16.cpp index 26e9259a..f035f014 100644 --- a/test/common/bfloat16.cpp +++ b/test/common/bfloat16.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -30,4 +30,12 @@ std::ostream& operator<<(std::ostream& os, BFloat16 value) { return os << static_cast(value); } +uint16_t BFloat16::kai_test_bfloat16_from_float(float value) { + uint32_t value_u32; + + memcpy(&value_u32, &value, sizeof(value)); + + return value_u32 >> 16; +} + } // namespace kai::test diff --git a/test/common/bfloat16.hpp b/test/common/bfloat16.hpp index e616657a..867edac5 100644 --- a/test/common/bfloat16.hpp +++ b/test/common/bfloat16.hpp @@ -13,17 +13,6 @@ #include "test/common/type_traits.hpp" -extern "C" { - -/// Converts single-precision floating-point to half-precision brain floating-point. -/// -/// @params[in] value The single-precision floating-point value. -/// -/// @return The half-precision brain floating-point value reinterpreted as 16-bit unsigned integer. -uint16_t kai_test_bfloat16_from_float(float value); - -} // extern "C" - namespace kai::test { /// Half-precision brain floating-point. @@ -73,6 +62,8 @@ private: /// @return The output stream. friend std::ostream& operator<<(std::ostream& os, BFloat16 value); + static uint16_t kai_test_bfloat16_from_float(float value); + uint16_t m_data; }; diff --git a/test/common/bfloat16_asm.S b/test/common/bfloat16_asm.S deleted file mode 100644 index 9f16cda4..00000000 --- a/test/common/bfloat16_asm.S +++ /dev/null @@ -1,18 +0,0 @@ -// -// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates -// -// SPDX-License-Identifier: Apache-2.0 -// - -#include "test/common/assembly.h" - - KAI_ASM_HEADER - - KAI_ASM_EXPORT(kai_test_bfloat16_from_float) - -KAI_ASM_FUNCTION(kai_test_bfloat16_from_float) - KAI_ASM_INST(0x1e634000) // bfcvt h0, s0 - fmov w0, h0 - ret - - KAI_ASM_FOOTER diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 954496bf..9a0d2469 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -50,35 +50,30 @@ #include "test/reference/transpose.hpp" namespace kai::test { - -// Test code contains calls to quantization functions using bf16. Additional check for BFloat16 CPU support required. -static auto cpu_has_dotprod_and_bf16 = []() { return cpu_has_dotprod() && cpu_has_bf16(); }; -static auto cpu_has_i8mm_and_bf16 = []() { return cpu_has_i8mm() && cpu_has_bf16(); }; - static const std::array, 11> variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p = { {{UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p8x4_1x8_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p4x8_1x4x32_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p4x4_16x4_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod), - "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod", cpu_has_dotprod_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp4x4_qsi4c32p8x4_4x8_neon_dotprod", cpu_has_dotprod}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm), - "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm", cpu_has_i8mm_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_8x4x32_neon_i8mm", cpu_has_i8mm}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm), - "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm", cpu_has_i8mm_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8x32_neon_i8mm", cpu_has_i8mm}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm), - "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm", cpu_has_i8mm_and_bf16}, + "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm", cpu_has_i8mm}, {UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm), - "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm", cpu_has_i8mm_and_bf16}}}; + "kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p8x8_4x8_neon_i8mm", cpu_has_i8mm}}}; using MatMulTestParams_withBL = std::tuple; -- GitLab From 089a49a883d4d41ab2d8b2a85b5e518e6ffb51b8 Mon Sep 17 00:00:00 2001 From: Matthew Bentham Date: Fri, 11 Apr 2025 09:14:15 +0000 Subject: [PATCH 2/4] Extend test framework BFloat16 tests Add a few tests for negative numbers, and allow running the tests on CPUs without bfloat16 hardware support (as that's not needed for the test framework code). Signed-off-by: Matthew Bentham --- test/tests/bfloat16_test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/tests/bfloat16_test.cpp b/test/tests/bfloat16_test.cpp index 8a2886c8..996d2e68 100644 --- a/test/tests/bfloat16_test.cpp +++ b/test/tests/bfloat16_test.cpp @@ -13,13 +13,11 @@ namespace kai::test { TEST(BFloat16, SimpleTest) { - if (!cpu_has_bf16()) { - GTEST_SKIP() << "No CPU support for BFloat16"; - } - ASSERT_EQ(static_cast(BFloat16()), 0.0F); ASSERT_EQ(static_cast(BFloat16(1.25F)), 1.25F); + ASSERT_EQ(static_cast(BFloat16(-1.25F)), -1.25F); ASSERT_EQ(static_cast(BFloat16(3)), 3.0F); + ASSERT_EQ(static_cast(BFloat16(-3)), -3.0F); ASSERT_FALSE(BFloat16(1.25F) == BFloat16(2.0F)); ASSERT_TRUE(BFloat16(1.25F) == BFloat16(1.25F)); -- GitLab From b00c97a85192eb5f6adcca3b19f97238d7408bca Mon Sep 17 00:00:00 2001 From: Matthew Bentham Date: Mon, 14 Apr 2025 11:52:08 +0000 Subject: [PATCH 3/4] Rename kai_test_bfloat16_from_float to bfloat16_from_float Signed-off-by: Matthew Bentham --- test/common/bfloat16.cpp | 2 +- test/common/bfloat16.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/common/bfloat16.cpp b/test/common/bfloat16.cpp index f035f014..8f10ea40 100644 --- a/test/common/bfloat16.cpp +++ b/test/common/bfloat16.cpp @@ -30,7 +30,7 @@ std::ostream& operator<<(std::ostream& os, BFloat16 value) { return os << static_cast(value); } -uint16_t BFloat16::kai_test_bfloat16_from_float(float value) { +uint16_t BFloat16::bfloat16_from_float(float value) { uint32_t value_u32; memcpy(&value_u32, &value, sizeof(value)); diff --git a/test/common/bfloat16.hpp b/test/common/bfloat16.hpp index 867edac5..e91715f7 100644 --- a/test/common/bfloat16.hpp +++ b/test/common/bfloat16.hpp @@ -22,14 +22,14 @@ public: BFloat16() = default; /// Creates a new object from the specified numeric value. - explicit BFloat16(float value) : m_data(kai_test_bfloat16_from_float(value)) { + explicit BFloat16(float value) : m_data(bfloat16_from_float(value)) { } /// Assigns to the specified numeric value which will be converted to `bfloat16_t`. template , bool> = true> BFloat16& operator=(T value) { const auto value_f32 = static_cast(value); - m_data = kai_test_bfloat16_from_float(value_f32); + m_data = bfloat16_from_float(value_f32); return *this; } @@ -62,7 +62,7 @@ private: /// @return The output stream. friend std::ostream& operator<<(std::ostream& os, BFloat16 value); - static uint16_t kai_test_bfloat16_from_float(float value); + static uint16_t bfloat16_from_float(float value); uint16_t m_data; }; -- GitLab From 0993ad6226b3b7bef00b450ad1993d62df06d1c4 Mon Sep 17 00:00:00 2001 From: Matthew Bentham Date: Thu, 17 Apr 2025 12:34:53 +0000 Subject: [PATCH 4/4] Rename BFloat16::bfloat16_from_float Renamed to BFloat16::float_to_bfloat16_round_towards_zero Signed-off-by: Matthew Bentham --- test/common/bfloat16.cpp | 2 +- test/common/bfloat16.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/common/bfloat16.cpp b/test/common/bfloat16.cpp index 8f10ea40..bd70de26 100644 --- a/test/common/bfloat16.cpp +++ b/test/common/bfloat16.cpp @@ -30,7 +30,7 @@ std::ostream& operator<<(std::ostream& os, BFloat16 value) { return os << static_cast(value); } -uint16_t BFloat16::bfloat16_from_float(float value) { +uint16_t BFloat16::float_to_bfloat16_round_towards_zero(float value) { uint32_t value_u32; memcpy(&value_u32, &value, sizeof(value)); diff --git a/test/common/bfloat16.hpp b/test/common/bfloat16.hpp index e91715f7..a8ba195b 100644 --- a/test/common/bfloat16.hpp +++ b/test/common/bfloat16.hpp @@ -22,14 +22,14 @@ public: BFloat16() = default; /// Creates a new object from the specified numeric value. - explicit BFloat16(float value) : m_data(bfloat16_from_float(value)) { + explicit BFloat16(float value) : m_data(float_to_bfloat16_round_towards_zero(value)) { } /// Assigns to the specified numeric value which will be converted to `bfloat16_t`. template , bool> = true> BFloat16& operator=(T value) { const auto value_f32 = static_cast(value); - m_data = bfloat16_from_float(value_f32); + m_data = float_to_bfloat16_round_towards_zero(value_f32); return *this; } @@ -62,7 +62,7 @@ private: /// @return The output stream. friend std::ostream& operator<<(std::ostream& os, BFloat16 value); - static uint16_t bfloat16_from_float(float value); + static uint16_t float_to_bfloat16_round_towards_zero(float value); uint16_t m_data; }; -- GitLab