From d770c069c489d078b605116b9029678bb7ddc506 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 6 May 2025 17:31:24 +0100 Subject: [PATCH 1/3] Use new Buffer class for the entire test framework * Replace `std::vector` by `Buffer` class. * Update `Buffer` class: - Add support for initial value of the buffer. - Always initialize the buffer with 0 by default. * Add `pad_matrix` reference function to support extending the data buffer. Signed-off-by: Viet-Hoa Do --- test/common/buffer.cpp | 4 +- test/common/buffer.hpp | 7 +- test/common/int4.cpp | 5 +- test/common/int4.hpp | 4 +- test/reference/binary_elementwise.cpp | 25 ++++--- test/reference/binary_elementwise.hpp | 15 ++-- test/reference/cast.cpp | 17 +++-- test/reference/cast.hpp | 9 ++- test/reference/clamp.cpp | 13 ++-- test/reference/clamp.hpp | 5 +- test/reference/fill.cpp | 24 +++--- test/reference/fill.hpp | 8 +- test/reference/matmul.cpp | 74 +++++++++---------- test/reference/matmul.hpp | 15 ++-- test/reference/matmul_pack.cpp | 12 ++- test/reference/matmul_pack.hpp | 6 +- test/reference/pack.cpp | 42 +++++------ test/reference/pack.hpp | 15 ++-- test/reference/pad.cpp | 44 +++++++++-- test/reference/pad.hpp | 22 +++++- test/reference/quantize.cpp | 69 +++++++++-------- test/reference/quantize.hpp | 17 +++-- test/reference/reduce.cpp | 33 ++++----- test/reference/reduce.hpp | 6 +- test/reference/reorder.cpp | 10 +-- test/reference/reorder.hpp | 7 +- test/reference/transpose.cpp | 24 +++--- test/reference/transpose.hpp | 9 ++- test/tests/buffer_test.cpp | 6 +- test/tests/imatmul_test.cpp | 4 +- .../matmul_clamp_f16_bf16p_bf16p_test.cpp | 33 ++++----- .../matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp | 11 +-- .../matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp | 9 ++- ...atmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp | 15 ++-- .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 39 +++++----- test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 4 +- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 19 +++-- 
.../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 41 +++++----- .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp | 13 ++-- ...atmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp | 15 ++-- ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 10 ++- .../matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 18 +++-- test/tests/matmul_test.cpp | 52 +++++++------ 43 files changed, 456 insertions(+), 374 deletions(-) diff --git a/test/common/buffer.cpp b/test/common/buffer.cpp index 65c9e261..3945eba2 100644 --- a/test/common/buffer.cpp +++ b/test/common/buffer.cpp @@ -21,7 +21,7 @@ namespace kai::test { -Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { +Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) { KAI_ASSUME_MSG(size > 0, "Buffers must be of non-zero size"); const char* val = getenv("KAI_TEST_BUFFER_POLICY"); @@ -57,6 +57,8 @@ Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { default: allocate(); } + + memset(data(), init_value, size); } void Buffer::allocate() { diff --git a/test/common/buffer.hpp b/test/common/buffer.hpp index a2226f53..4cb06e4b 100644 --- a/test/common/buffer.hpp +++ b/test/common/buffer.hpp @@ -29,7 +29,8 @@ class Buffer { using handle = std::unique_ptr>; public: - explicit Buffer(size_t size); + Buffer() = default; + Buffer(size_t size, uint8_t init_value = 0); Buffer(const Buffer& other) = delete; Buffer(Buffer&& other) noexcept = default; @@ -41,7 +42,7 @@ public: /// Gets the base memory address of the user buffer. /// /// @return Base memory address of the user buffer. 
- [[nodiscard]] void* data() const { + [[nodiscard]] std::byte* data() const { return static_cast(m_buffer.get()) + m_user_buffer_offset; } @@ -81,7 +82,7 @@ private: handle m_buffer = nullptr; - size_t m_user_buffer_size; + size_t m_user_buffer_size = 0; size_t m_user_buffer_offset = 0; BufferProtectionPolicy m_protection_policy = BufferProtectionPolicy::None; diff --git a/test/common/int4.cpp b/test/common/int4.cpp index ff64de20..f0e1a56d 100644 --- a/test/common/int4.cpp +++ b/test/common/int4.cpp @@ -11,6 +11,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/memory.hpp" namespace kai::test { @@ -115,9 +116,9 @@ std::tuple Int4::unpack_u8(uint8_t value) { // ===================================================================================================================== -std::vector convert_s0s1_s1s0(const std::vector& src) { +Buffer convert_s0s1_s1s0(const Buffer& src) { const auto length = src.size(); - std::vector dst(length); + Buffer dst(length); for (size_t i = 0; i < length; ++i) { uint8_t val = read_array(src.data(), i); diff --git a/test/common/int4.hpp b/test/common/int4.hpp index 1d9ba8b5..aa05d9bd 100644 --- a/test/common/int4.hpp +++ b/test/common/int4.hpp @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// 4-bit unsigned integer. @@ -121,6 +123,6 @@ private: /// @param[in] src The data buffer. /// /// @return The buffer with packed byte, where the high and low nibbles reversed. 
-std::vector convert_s0s1_s1s0(const std::vector& src); +Buffer convert_s0s1_s1s0(const Buffer& src); } // namespace kai::test diff --git a/test/reference/binary_elementwise.cpp b/test/reference/binary_elementwise.cpp index 803d87fb..48434e47 100644 --- a/test/reference/binary_elementwise.cpp +++ b/test/reference/binary_elementwise.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" #include "test/common/int4.hpp" @@ -67,13 +68,13 @@ T scalar_binary_elementwise(T lhs, T rhs) { /// /// @return The result data buffer. template -std::vector binary_elementwise_any_op_type( +Buffer binary_elementwise_any_op_type( const void* lhs, const void* rhs, size_t lhs_height, size_t lhs_width, size_t rhs_height, size_t rhs_width) { const auto height = std::max(lhs_height, rhs_height); const auto width = std::max(lhs_width, rhs_width); KAI_ASSUME(width * size_in_bits % 8 == 0); - std::vector dst(height * width * size_in_bits / 8); + Buffer dst(height * width * size_in_bits / 8); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ -94,7 +95,7 @@ std::vector binary_elementwise_any_op_type( } template -std::vector binary_elementwise_any_type( +Buffer binary_elementwise_any_type( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { KAI_ASSUME(lhs_dt == rhs_dt); @@ -121,14 +122,14 @@ std::vector binary_elementwise_any_type( } // namespace -std::vector add( +Buffer add( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( lhs, lhs_dt, lhs_height, lhs_width, rhs, rhs_dt, rhs_height, rhs_width); } -std::vector sub( +Buffer sub( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType 
rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( @@ -136,18 +137,18 @@ std::vector sub( } template -std::vector sub( +Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_op_type( lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); } -template std::vector sub( +template Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -std::vector mul( +Buffer mul( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( @@ -155,22 +156,22 @@ std::vector mul( } template -std::vector mul( +Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_op_type( lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); } -template std::vector mul( +template Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -template std::vector mul( +template Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -std::vector div( +Buffer div( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( diff --git a/test/reference/binary_elementwise.hpp b/test/reference/binary_elementwise.hpp index f2f5c0ab..713f8692 100644 --- a/test/reference/binary_elementwise.hpp +++ b/test/reference/binary_elementwise.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: 
Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,7 +29,7 @@ namespace kai::test { /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector add( +Buffer add( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -46,7 +47,7 @@ std::vector add( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector sub( +Buffer sub( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -65,7 +66,7 @@ std::vector sub( /// /// @return The result matrix. template -std::vector sub( +Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); @@ -83,7 +84,7 @@ std::vector sub( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector mul( +Buffer mul( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -102,7 +103,7 @@ std::vector mul( /// /// @return The result matrix. template -std::vector mul( +Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); @@ -120,7 +121,7 @@ std::vector mul( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. 
-std::vector div( +Buffer div( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); diff --git a/test/reference/cast.cpp b/test/reference/cast.cpp index 95a63a32..e11cb350 100644 --- a/test/reference/cast.cpp +++ b/test/reference/cast.cpp @@ -12,6 +12,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" #include "test/common/memory.hpp" @@ -20,8 +21,8 @@ namespace kai::test { template -std::vector cast(const void* src, size_t length) { - std::vector dst(round_up_division(length * size_in_bits, 8)); +Buffer cast(const void* src, size_t length) { + Buffer dst(round_up_division(length * size_in_bits, 8)); for (size_t i = 0; i < length; ++i) { write_array(dst.data(), i, static_cast(read_array(src, i))); @@ -30,11 +31,11 @@ std::vector cast(const void* src, size_t length) { return dst; } -template std::vector cast(const void* src, size_t length); -template std::vector cast(const void* src, size_t length); -template std::vector cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); -std::vector cast(const void* src, kai::test::DataType src_dt, DataType dst_dt, size_t height, size_t width) { +Buffer cast(const void* src, kai::test::DataType src_dt, DataType dst_dt, size_t height, size_t width) { const auto length = height * width; if (src_dt == DataType::BF16 && dst_dt == DataType::FP32) { @@ -46,8 +47,8 @@ std::vector cast(const void* src, kai::test::DataType src_dt, DataType KAI_ERROR("Unsupported cast data type!"); } -std::vector cast_qsu4_qsi4(const void* src, size_t length) { - std::vector dst(round_up_division(length, 2)); +Buffer cast_qsu4_qsi4(const void* src, size_t length) { + Buffer 
dst(round_up_division(length, 2)); for (size_t i = 0; i < length; ++i) { write_array(dst.data(), i, static_cast(static_cast(read_array(src, i)) + 8)); diff --git a/test/reference/cast.hpp b/test/reference/cast.hpp index 744d5a3c..8dc09b22 100644 --- a/test/reference/cast.hpp +++ b/test/reference/cast.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -24,7 +25,7 @@ namespace kai::test { /// /// @return A new data buffer containing casted values. template -std::vector cast(const void* src, size_t length); +Buffer cast(const void* src, size_t length); /// Converts each element of the source matrix to the new data type. /// @@ -35,7 +36,7 @@ std::vector cast(const void* src, size_t length); /// @param[in] width Number of columns. /// /// @return The result matrix containing data in the destination data type. -std::vector cast(const void* src, DataType src_dt, DataType dst_dt, size_t height, size_t width); +Buffer cast(const void* src, DataType src_dt, DataType dst_dt, size_t height, size_t width); /// Converts each element of the source data from 4-bit signed symmetric quantized /// to 4-bit unsigned symmetric quantized. @@ -44,6 +45,6 @@ std::vector cast(const void* src, DataType src_dt, DataType dst_dt, siz /// @param[in] length The number of elements. /// /// @return A new data buffer with converted values. 
-std::vector cast_qsu4_qsi4(const void* src, size_t length); +Buffer cast_qsu4_qsi4(const void* src, size_t length); } // namespace kai::test diff --git a/test/reference/clamp.cpp b/test/reference/clamp.cpp index ab2e77e9..eadc755e 100644 --- a/test/reference/clamp.cpp +++ b/test/reference/clamp.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/float16.hpp" #include "test/common/memory.hpp" #include "test/common/numeric_limits.hpp" @@ -64,8 +65,8 @@ std::tuple find_clamp_range(DataType type, const void* src, size_t } template -std::vector clamp(const void* src, size_t len, T min_value, T max_value) { - std::vector dst(round_up_division(len * size_in_bits, 8)); +Buffer clamp(const void* src, size_t len, T min_value, T max_value) { + Buffer dst(round_up_division(len * size_in_bits, 8)); for (size_t i = 0; i < len; ++i) { write_array(dst.data(), i, std::clamp(read_array(src, i), min_value, max_value)); @@ -74,11 +75,11 @@ std::vector clamp(const void* src, size_t len, T min_value, T max_value return dst; } -template std::vector clamp(const void* src, size_t len, float min_value, float max_value); -template std::vector clamp(const void* src, size_t len, Float16 min_value, Float16 max_value); +template Buffer clamp(const void* src, size_t len, float min_value, float max_value); +template Buffer clamp(const void* src, size_t len, Float16 min_value, Float16 max_value); -std::vector clamp(DataType type, const void* src, size_t len, float min_value, float max_value) { - std::vector dst(round_up_division(len * data_type_size_in_bits(type), 8)); +Buffer clamp(DataType type, const void* src, size_t len, float min_value, float max_value) { + Buffer dst(round_up_division(len * data_type_size_in_bits(type), 8)); for (size_t i = 0; i < len; ++i) { write_array(type, dst.data(), i, std::clamp(read_array(type, src, i), min_value, max_value)); diff --git a/test/reference/clamp.hpp b/test/reference/clamp.hpp index 
532e7d25..52ca57ba 100644 --- a/test/reference/clamp.hpp +++ b/test/reference/clamp.hpp @@ -11,6 +11,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -42,7 +43,7 @@ std::tuple find_clamp_range(DataType type, const void* src, size_t /// @param[in] min_value Lower bound of clamp. /// @param[in] width Upper bound of clamp. template -std::vector clamp(const void* src, size_t len, T min_value, T max_value); +Buffer clamp(const void* src, size_t len, T min_value, T max_value); /// Clamps the matrix. /// @@ -51,5 +52,5 @@ std::vector clamp(const void* src, size_t len, T min_value, T max_value /// @param[in] len Number of values in the source matrix. /// @param[in] min_value Lower bound of clamp. /// @param[in] max_value Upper bound of clamp. -std::vector clamp(DataType type, const void* src, size_t len, float min_value, float max_value); +Buffer clamp(DataType type, const void* src, size_t len, float min_value, float max_value); } // namespace kai::test diff --git a/test/reference/fill.cpp b/test/reference/fill.cpp index 459055a1..82068fdb 100644 --- a/test/reference/fill.cpp +++ b/test/reference/fill.cpp @@ -15,6 +15,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -26,12 +27,11 @@ namespace kai::test { namespace { template -std::vector fill_matrix_raw(size_t height, size_t width, std::function gen) { +Buffer fill_matrix_raw(size_t height, size_t width, std::function gen) { const auto size = height * width * size_in_bits / 8; KAI_ASSUME(width * size_in_bits % 8 == 0); - std::vector data; - data.resize(size); + Buffer data(size); auto ptr = reinterpret_cast(data.data()); for (size_t y = 0; y < height; ++y) { @@ -44,7 +44,7 @@ std::vector fill_matrix_raw(size_t height, size_t width, std::function< } template -std::vector 
fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { using TDist = std::conditional_t< std::is_floating_point_v, std::uniform_real_distribution, std::uniform_int_distribution>; @@ -55,7 +55,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_ } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_real_distribution dist; @@ -63,7 +63,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_real_distribution dist; @@ -71,7 +71,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t widt } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_int_distribution dist(-8, 7); @@ -79,7 +79,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, u } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_int_distribution dist(0, 15); @@ -88,7 +88,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, } // namespace -std::vector fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed) { +Buffer fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed) { switch (format.pack_format()) { case DataFormat::PackFormat::NONE: switch (format.data_type()) { @@ -119,11 +119,11 @@ std::vector 
fill_matrix_random(size_t height, size_t width, const DataF } template -std::vector fill_random(size_t length, uint32_t seed) { +Buffer fill_random(size_t length, uint32_t seed) { return fill_matrix_random_raw(1, length, seed); } -template std::vector fill_random(size_t length, uint32_t seed); -template std::vector fill_random(size_t length, uint32_t seed); +template Buffer fill_random(size_t length, uint32_t seed); +template Buffer fill_random(size_t length, uint32_t seed); } // namespace kai::test diff --git a/test/reference/fill.hpp b/test/reference/fill.hpp index 25846c6c..9dd0f26c 100644 --- a/test/reference/fill.hpp +++ b/test/reference/fill.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -22,7 +24,7 @@ class DataFormat; /// @param[in] seed Random seed. /// /// @return The data buffer for the matrix. -std::vector fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed); +Buffer fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed); /// Creates a new data buffer filled with random data. /// @@ -33,6 +35,6 @@ std::vector fill_matrix_random(size_t height, size_t width, const DataF /// /// @return The data buffer. 
template -std::vector fill_random(size_t length, uint32_t seed); +Buffer fill_random(size_t length, uint32_t seed); } // namespace kai::test diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index b1378c75..be5bb6a6 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -42,7 +43,7 @@ namespace { /// /// @return The result data buffer. template -std::vector matmul_any_type( +Buffer matmul_any_type( const void* lhs, const void* rhs, // size_t m, size_t n, size_t k, // bool lhs_transposed, bool rhs_transposed) { @@ -52,8 +53,7 @@ std::vector matmul_any_type( const auto rhs_n_stride = rhs_transposed ? k : 1; const auto rhs_k_stride = rhs_transposed ? 1 : n; - std::vector dst; - dst.resize(m * n * size_in_bits / 8); + Buffer dst(m * n * size_in_bits / 8); KAI_ASSUME(n * size_in_bits % 8 == 0); for (size_t im = 0; im < m; ++im) { @@ -75,7 +75,7 @@ std::vector matmul_any_type( } // namespace -std::vector matmul_pack_rhs( +Buffer matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, const DataFormat& dst_format, size_t n, size_t k, bool transposing) { const auto src_dt = src_format.data_type(); @@ -84,9 +84,9 @@ std::vector matmul_pack_rhs( const auto dst_dt = dst_format.data_type(); const auto dst_pf = dst_format.pack_format(); - std::vector tmp_data; - std::vector tmp_scales; - std::vector tmp_zero_points; + Buffer tmp_data; + Buffer tmp_scales; + Buffer tmp_zero_points; if (transposing) { tmp_data = transpose(data, src_dt, k, n); @@ -125,7 +125,7 @@ std::vector matmul_pack_rhs( return pack(dst_format, data, scales, zero_points, src_format, n, k); } -std::vector matmul( +Buffer matmul( const void* lhs, [[maybe_unused]] const void* lhs_scales, [[maybe_unused]] const void* 
lhs_zero_points, DataType lhs_dt, // const void* rhs, [[maybe_unused]] const void* rhs_scales, [[maybe_unused]] const void* rhs_zero_points, @@ -140,10 +140,10 @@ std::vector matmul( const auto rhs_h = rhs_transposed ? n : k; const auto rhs_w = rhs_transposed ? k : n; - std::vector tmp_lhs; - std::vector tmp_rhs; - std::vector tmp_dst; - std::vector tmp_bias; + Buffer tmp_lhs; + Buffer tmp_rhs; + Buffer tmp_dst; + Buffer tmp_bias; if (lhs_dt != dst_dt) { tmp_lhs = cast(lhs, lhs_dt, dst_dt, lhs_h, lhs_w); @@ -184,7 +184,7 @@ std::vector matmul( return tmp_dst; } -std::vector indirect_matmul( +Buffer indirect_matmul( const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // @@ -196,7 +196,7 @@ std::vector indirect_matmul( // This is inefficient, but allows code-reuse const size_t chunk_bytes = k_chunk_length * round_up_division(data_type_size_in_bits(lhs_dt), 8); const size_t n_chunks = m * k_chunk_count; - std::vector lhs(n_chunks * chunk_bytes); + Buffer lhs(n_chunks * chunk_bytes); // Copy all chunks to the created matrix for (size_t i = 0; i < n_chunks; i += 1) { @@ -217,7 +217,7 @@ std::vector indirect_matmul( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector indirect_matmul_nt_t_quantized( +Buffer indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, @@ -228,7 +228,7 @@ std::vector indirect_matmul_nt_t_quantized( const auto lhs_num_quant_per_row = round_up_division(k_chunk_count * k_chunk_length, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k_chunk_count * 
k_chunk_length, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); for (size_t i_m = 0; i_m < m; ++i_m) { for (size_t i_n = 0; i_n < n; ++i_n) { @@ -293,7 +293,7 @@ std::vector indirect_matmul_nt_t_quantized( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector matmul_nt_t_quantized( +Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, // size_t lhs_quant_height, size_t lhs_quant_width, // @@ -304,7 +304,7 @@ std::vector matmul_nt_t_quantized( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); for (size_t row = 0; row < m; ++row) { for (size_t col = 0; col < n; ++col) { @@ -355,8 +355,7 @@ std::vector matmul_nt_t_quantized( return dst; } -template std::vector -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -364,8 +363,7 @@ matmul_nt_t_quantized -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -373,8 +371,7 @@ matmul_nt_t_quantized -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -382,7 +379,7 @@ matmul_nt_t_quantized +template Buffer 
indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales, @@ -395,7 +392,7 @@ indirect_matmul_nt_t_quantized -std::vector matmul_clamp_nt_t( +Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -404,7 +401,7 @@ std::vector matmul_clamp_nt_t( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); const auto* lhs_scales_ptr = reinterpret_cast(lhs_scales); const auto* rhs_scales_ptr = reinterpret_cast(rhs_scales); @@ -448,29 +445,28 @@ std::vector matmul_clamp_nt_t( return dst; } -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector -matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const 
void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -480,7 +476,7 @@ template std::vector matmul_clamp_nt_t -std::vector matmul_clamp_nt_nt( +Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -489,7 +485,7 @@ std::vector matmul_clamp_nt_nt( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); const auto* lhs_scales_ptr = reinterpret_cast(lhs_scales); const auto* rhs_scales_ptr = reinterpret_cast(rhs_scales); @@ -533,29 +529,27 @@ std::vector matmul_clamp_nt_nt( return dst; } -template std::vector matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float 
min_value, float max_value); -template std::vector -matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector -matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index 343b8d34..d349d0b4 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,7 +29,7 @@ class DataFormat; /// @param[in] transposing Perform transpose then pack. /// /// @return The packed RHS matrix. -std::vector matmul_pack_rhs( +Buffer matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, const DataFormat& dst_format, size_t n, size_t k, bool transposing); @@ -57,7 +58,7 @@ std::vector matmul_pack_rhs( /// @param[in] rhs_transposed `true` if RHS operand is transposed. /// /// @return The result data buffer. -std::vector matmul( +Buffer matmul( const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // const void* bias, const void* bias_scales, const void* bias_zero_points, DataType bias_dt, // @@ -88,7 +89,7 @@ std::vector matmul( /// @param[in] k_chunk_size Number of elements in each LHS K chunk /// /// @return The result data buffer. 
-std::vector indirect_matmul( +Buffer indirect_matmul( const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // @@ -129,7 +130,7 @@ std::vector indirect_matmul( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename Bias, typename IntAcc, typename DstData> -std::vector matmul_clamp_nt_t( +Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -174,7 +175,7 @@ std::vector matmul_clamp_nt_t( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename Bias, typename IntAcc, typename DstData> -std::vector matmul_clamp_nt_nt( +Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -184,7 +185,7 @@ std::vector matmul_clamp_nt_nt( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector matmul_nt_t_quantized( +Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -195,7 +196,7 @@ std::vector matmul_nt_t_quantized( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename 
RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector indirect_matmul_nt_t_quantized( +Buffer indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, diff --git a/test/reference/matmul_pack.cpp b/test/reference/matmul_pack.cpp index 55e916a0..40139e09 100644 --- a/test/reference/matmul_pack.cpp +++ b/test/reference/matmul_pack.cpp @@ -10,16 +10,18 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/round.hpp" #include "test/reference/binary_elementwise.hpp" #include "test/reference/pack.hpp" +#include "test/reference/pad.hpp" #include "test/reference/reduce.hpp" #include "test/reference/reorder.hpp" namespace kai::test { template -std::vector matmul_pack_rhs_nxk_static_quantized( +Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, Scale lhs_scale, Scale dst_scale, const void* biases, ZeroPoint lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width) { // The RHS data matrix is reordered according to the blocking parameters. @@ -29,7 +31,8 @@ std::vector matmul_pack_rhs_nxk_static_quantized( // final_scales[n_index] = lhs_scale * rhs_scales[n_index] / dst_scale. const auto scale_multiplier = lhs_scale / dst_scale; auto combined_scales = mul(scales, 1, n, &scale_multiplier, 1, 1); - combined_scales.resize(round_up_multiple(n, block_height) * sizeof(Scale)); // Pads with 0s. + combined_scales = pad_matrix( + combined_scales.data(), 1, n, 0, 0, round_up_multiple(n, block_height) - n, 0, 0); // Pads with 0s. // The effective per-channel biases: // final_biases[n_index] = biases[n_index] - lhs_zero_point * sum(data[n_index, :]). 
@@ -37,7 +40,8 @@ std::vector matmul_pack_rhs_nxk_static_quantized( // Reduced across width earlier, so lhs width is now 1 const auto row_sum_times_lhs_zp = mul(row_sum_reduced.data(), n, 1, &lhs_zero_point, 1, 1); auto combined_biases = sub(biases, 1, n, row_sum_times_lhs_zp.data(), 1, n); - combined_biases.resize(round_up_multiple(n, block_height) * sizeof(ZeroPoint)); // Pads with 0s. + combined_biases = pad_matrix( + combined_biases.data(), 1, n, 0, 0, round_up_multiple(n, block_height) - n, 0, 0); // Pads with 0s. // Packs the effective biases followed by the data block followed by the effective scales for the block. auto packed_rhs = pack_zero_points_data_scales_per_block( @@ -47,7 +51,7 @@ std::vector matmul_pack_rhs_nxk_static_quantized( return packed_rhs; } -template std::vector matmul_pack_rhs_nxk_static_quantized( +template Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, float lhs_scale, float dst_scale, const void* biases, int32_t lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width); diff --git a/test/reference/matmul_pack.hpp b/test/reference/matmul_pack.hpp index 30646c98..ea713dd7 100644 --- a/test/reference/matmul_pack.hpp +++ b/test/reference/matmul_pack.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Packs the RHS buffer for static quantized GeMM. @@ -37,7 +39,7 @@ namespace kai::test { /// /// @return The packed RHS. 
template -std::vector matmul_pack_rhs_nxk_static_quantized( +Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, Scale lhs_scale, Scale dst_scale, const void* biases, ZeroPoint lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width); diff --git a/test/reference/pack.cpp b/test/reference/pack.cpp index 055026f7..e06e3303 100644 --- a/test/reference/pack.cpp +++ b/test/reference/pack.cpp @@ -16,6 +16,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -39,13 +40,13 @@ BFloat16 convert(const uint8_t* src_ptr_elm, DataType src_dtype, DataType dst_dt } } -std::vector pack_block( +Buffer pack_block( const void* src, DataType src_dtype, DataType dst_dtype, size_t src_esize, size_t dst_esize, size_t full_height, size_t full_width, size_t block_height, size_t block_width, size_t subblock_height, size_t subblock_width) { const auto dst_bytes = round_up_multiple(full_height, block_height) * round_up_multiple(full_width, block_width) * dst_esize; - std::vector dst(dst_bytes, 0); + Buffer dst(dst_bytes, 0); const auto* src_ptr = reinterpret_cast(src); auto* dst_ptr = dst.data(); @@ -97,7 +98,7 @@ std::vector pack_block( } /// Packs the matrix from raw to per-row bias format. 
-std::vector pack_bias_per_row( +Buffer pack_bias_per_row( DataType src_dtype, DataType bias_dtype, DataType dst_dtype, size_t src_esize, size_t bias_esize, size_t dst_esize, const void* src, const void* bias, size_t height, size_t width, size_t block_height, size_t block_width, size_t subblock_height, size_t subblock_width) { @@ -110,7 +111,7 @@ std::vector pack_bias_per_row( const auto group_bytes = group_bias_bytes + group_num_blocks * block_data_bytes; const auto dst_bytes = num_groups * group_bytes; - std::vector dst(dst_bytes, 0); + Buffer dst(dst_bytes, 0); const auto* src_ptr = reinterpret_cast(src); const auto* bias_ptr = reinterpret_cast(bias); @@ -170,7 +171,7 @@ std::vector pack_bias_per_row( } // namespace -std::vector pack( +Buffer pack( const DataFormat& dst_format, const void* src, [[maybe_unused]] const void* scales, const void* bias, const DataFormat& src_format, size_t height, size_t width) { const auto dst_dt = dst_format.data_type(); @@ -219,8 +220,7 @@ std::vector pack( } template -std::vector pack_data_scales( - const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { +Buffer pack_data_scales(const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { KAI_ASSUME_IF(size_in_bits < 8, quant_width % (8 / size_in_bits) == 0); KAI_ASSUME_IF(size_in_bits < 8, width % (8 / size_in_bits) == 0); @@ -229,7 +229,7 @@ std::vector pack_data_scales( const auto data_bytes = height * width * size_in_bits / 8; const auto scales_bytes = height * num_quant_packets_x * sizeof(Scale); - std::vector dst(data_bytes + scales_bytes); + Buffer dst(data_bytes + scales_bytes); const auto* scales_ptr = reinterpret_cast(scales); auto* dst_ptr = dst.data(); @@ -251,13 +251,13 @@ std::vector pack_data_scales( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } template -std::vector pack_zero_points_data_scales_per_block( +Buffer 
pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales) { // Only data is allowed to be sub-byte. @@ -272,7 +272,7 @@ std::vector pack_zero_points_data_scales_per_block( KAI_ASSUME( (block_num_data * size_in_bits + block_num_scales * size_in_bits) % size_in_bits == 0); - std::vector dst(round_up_division( + Buffer dst(round_up_division( num_blocks * (block_num_zero_points * size_in_bits + block_num_data * size_in_bits + block_num_scales * size_in_bits), @@ -297,17 +297,17 @@ std::vector pack_zero_points_data_scales_per_block( dst_ptr += block_num_scales * sizeof(Scale); } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } -template std::vector pack_zero_points_data_scales_per_block( +template Buffer pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales); template -std::vector pack_data_scales_interleave_block( +Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { KAI_ASSUME_IF(size_in_bits < 8, quant_width % (8 / size_in_bits) == 0); KAI_ASSUME_IF(size_in_bits < 8, width % (8 / size_in_bits) == 0); @@ -319,7 +319,7 @@ std::vector pack_data_scales_interleave_block( const auto data_bytes = height * width * size_in_bits / 8; const auto scales_bytes = scales != nullptr ? 
height * num_quant_packets_x * sizeof(Scale) : 0; - std::vector dst(data_bytes + scales_bytes); + Buffer dst(data_bytes + scales_bytes); const auto* scales_ptr = reinterpret_cast(scales); auto* dst_ptr = dst.data(); @@ -341,18 +341,18 @@ std::vector pack_data_scales_interleave_block( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } -template std::vector pack_data_scales_interleave_block( +template Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); -template std::vector pack_data_scales_interleave_block( +template Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); template -std::vector pack_block_data_zero_points_scale_bias( +Buffer pack_block_data_zero_points_scale_bias( const void* data, const void* zero_points, const void* scales, const void* biases, size_t height, size_t width, size_t quant_height, size_t quant_width, size_t block_height, size_t block_width, size_t interleave_x_blocks) { if (quant_width == width) { @@ -382,7 +382,7 @@ std::vector pack_block_data_zero_points_scale_bias( const auto biases_bytes = has_biases ? 
height * sizeof(Bias) : 0; const auto dst_bytes = num_quant_packets_y * num_quant_packets_x * quant_packet_bytes + biases_bytes; - std::vector dst(dst_bytes); + Buffer dst(dst_bytes); const auto* zero_points_ptr = reinterpret_cast(zero_points); const auto* scales_ptr = reinterpret_cast(scales); @@ -445,7 +445,7 @@ std::vector pack_block_data_zero_points_scale_bias( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } diff --git a/test/reference/pack.hpp b/test/reference/pack.hpp index 63c94d58..b424d5ba 100644 --- a/test/reference/pack.hpp +++ b/test/reference/pack.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -21,7 +23,7 @@ class DataFormat; /// @param[in] src_format Data format of the source matrix. /// @param[in] height Number of rows of the source matrix. /// @param[in] width Number of columns of the source matrix. -std::vector pack( +Buffer pack( const DataFormat& dst_format, const void* src, const void* scales, const void* bias, const DataFormat& src_format, size_t height, size_t width); @@ -76,8 +78,7 @@ std::vector pack( /// /// @return The packed data buffer. template -std::vector pack_data_scales( - const void* data, const void* scales, size_t height, size_t width, size_t quant_width); +Buffer pack_data_scales(const void* data, const void* scales, size_t height, size_t width, size_t quant_width); /// Packs the zero point, data and scale into a single buffer. /// @@ -139,7 +140,7 @@ std::vector pack_data_scales( /// /// @return The packed data buffer. 
template -std::vector pack_zero_points_data_scales_per_block( +Buffer pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales); @@ -197,7 +198,7 @@ std::vector pack_zero_points_data_scales_per_block( /// /// @return The packed data buffer. template -std::vector pack_data_scales_interleave_block( +Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); /// Packs the quantized data with two halves of a block interleaved. @@ -235,7 +236,7 @@ std::vector pack_data_scales_interleave_block( /// /// @return The packed data buffer. template -std::vector pack_data_interleave_block(const void* data, size_t height, size_t width, size_t block_width) { +Buffer pack_data_interleave_block(const void* data, size_t height, size_t width, size_t block_width) { return pack_data_scales_interleave_block(data, nullptr, height, width, block_width); } diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index 5b6f2b8d..182f6dc8 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,16 +12,18 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/memory.hpp" +#include "test/common/round.hpp" namespace kai::test { template -std::vector pad_row( +Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val) { - std::vector output(dst_size, val); + Buffer output(dst_size, val); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ 
-31,11 +33,43 @@ std::vector pad_row( } return output; } -template std::vector pad_row( +template Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val); -template std::vector pad_row( +template Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val); + +template +Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + T pad_value) { + const size_t dst_height = height + pad_top + pad_bottom; + const size_t dst_width = width + pad_left + pad_right; + const size_t dst_size = round_up_multiple(dst_height * dst_width * size_in_bits, 8); + + Buffer dst(dst_size); + + for (size_t y = 0; y < dst_height; ++y) { + for (size_t x = 0; x < dst_width; ++x) { + if (y >= pad_top && y < pad_top + height && x >= pad_left && x < pad_left + width) { + const T value = read_array(data, (y - pad_top) * width + x - pad_left); + write_array(dst.data(), y * dst_width + x, value); + } else { + write_array(dst.data(), y * dst_width + x, pad_value); + } + } + } + + return dst; +} + +template Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + float pad_value); +template Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + int32_t pad_value); + } // namespace kai::test diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index 2f46639d..74c0229a 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // 
SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,8 +29,25 @@ namespace kai::test { /// @return The padded matrix. /// template -std::vector pad_row( +Buffer pad_row( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size, uint8_t val = 0); +/// Pads the matrix with value. +/// +/// @param[in] data The input data buffer. +/// @param[in] height The number of input rows. +/// @param[in] width The number of input columns. +/// @param[in] pad_left The number of element padded to the left. +/// @param[in] pad_top The number of element padded to the top. +/// @param[in] pad_right The number of element padded to the right. +/// @param[in] pad_bottom The number of element padded to the bottom. +/// @param[in] pad_value The padding value. +/// +/// @return The padded matrix. +template +Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + T pad_value); + } // namespace kai::test diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index 477a13c2..c64d8404 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -14,6 +14,7 @@ #include #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" #include "test/common/numeric_limits.hpp" @@ -77,8 +78,7 @@ IntType quantize_asymmetric(FloatType value, FloatType scale, ZeroPointType zero template int8_t quantize_asymmetric(float value, float scale, int32_t zero_point); template -std::vector compute_symmetric_per_block_quantization_info( - const void* src, size_t height, size_t width, size_t quant_width) { +Buffer compute_symmetric_per_block_quantization_info(const void* src, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); 
static_assert(is_integral); static_assert(is_floating_point); @@ -88,7 +88,7 @@ std::vector compute_symmetric_per_block_quantization_info( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto scales_bytes = height * num_quant_packets_x * sizeof(ScaleType); - std::vector scales(scales_bytes); + Buffer scales(scales_bytes); const auto* src_ptr = reinterpret_cast(src); @@ -117,7 +117,7 @@ std::vector compute_symmetric_per_block_quantization_info( } template -std::vector quantize_symmetric_per_block( +Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); static_assert(is_integral); @@ -126,7 +126,7 @@ std::vector quantize_symmetric_per_block( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto data_bytes = round_up_division(height * width * size_in_bits, 8); - std::vector data(data_bytes); + Buffer data(data_bytes); const auto* src_ptr = reinterpret_cast(src); @@ -148,11 +148,11 @@ std::vector quantize_symmetric_per_block( return data; } -template std::vector quantize_symmetric_per_block( +template Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width); template -std::tuple, std::vector> quantize_symmetric_per_block_dynamic( +std::tuple quantize_symmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width) { auto scales_src_type = compute_symmetric_per_block_quantization_info(src, height, width, quant_width); @@ -160,30 +160,30 @@ std::tuple, std::vector> quantize_symmetric_per_bl src, scales_src_type.data(), height, width, quant_width); if constexpr (std::is_same_v) { - return {data, scales_src_type}; + return {std::move(data), std::move(scales_src_type)}; } else { auto scales = cast(scales_src_type.data(), scales_src_type.size() * 8 / size_in_bits); - return {data, scales}; + return {std::move(data), 
std::move(scales)}; } } -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, Float16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, float>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, BFloat16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int8_t, Float16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int8_t, float>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int32_t, float>(const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); template -std::tuple, std::vector> compute_asymmetric_per_block_quantization_info( +std::tuple compute_asymmetric_per_block_quantization_info( const void* src, size_t height, size_t width, size_t quant_width) { 
static_assert(is_floating_point); static_assert(is_integral); @@ -195,10 +195,10 @@ std::tuple, std::vector> compute_asymmetric_per_bl const auto num_quant_packets_x = round_up_division(width, quant_width); const auto scales_bytes = height * num_quant_packets_x * sizeof(ScaleType); - std::vector scales(scales_bytes); + Buffer scales(scales_bytes); const auto zero_points_bytes = height * num_quant_packets_x * sizeof(ZeroPointType); - std::vector zero_points(zero_points_bytes); + Buffer zero_points(zero_points_bytes); for (size_t y = 0; y < height; ++y) { for (size_t x_quant = 0; x_quant < width; x_quant += quant_width) { @@ -226,11 +226,11 @@ std::tuple, std::vector> compute_asymmetric_per_bl } } - return {scales, zero_points}; + return {std::move(scales), std::move(zero_points)}; } template -std::vector quantize_asymmetric_per_block( +Buffer quantize_asymmetric_per_block( const void* src, const void* scales, const void* zero_points, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); static_assert(is_integral); @@ -240,7 +240,7 @@ std::vector quantize_asymmetric_per_block( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto data_bytes = round_up_division(height * width * size_in_bits, 8); - std::vector data(data_bytes); + Buffer data(data_bytes); for (size_t y = 0; y < height; ++y) { for (size_t x_quant = 0; x_quant < width; x_quant += quant_width) { @@ -267,7 +267,7 @@ std::vector quantize_asymmetric_per_block( } template -std::tuple, std::vector, std::vector> quantize_asymmetric_per_block_dynamic( +std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width) { /* Calculate the asymmetric quantization information, one scaling per row */ auto [scales_src_type, zero_points] = @@ -279,22 +279,19 @@ std::tuple, std::vector, std::vector> qua src, scales_src_type.data(), zero_points.data(), height, width, quant_width); if constexpr (std::is_same_v) { - 
return {data, scales_src_type, zero_points}; + return {std::move(data), std::move(scales_src_type), std::move(zero_points)}; } else { auto scales = cast(scales_src_type.data(), scales_src_type.size() * 8 / size_in_bits); - return {data, scales, zero_points}; + return {std::move(data), std::move(scales), std::move(zero_points)}; } } -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); } // namespace kai::test diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index 3e2f162d..e0c7d1f0 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Quantization method. @@ -90,8 +92,7 @@ IntType quantize_asymmetric(FloatType value, FloatType scale, ZeroPointType zero /// /// @return The quantization scale matrix. template -std::vector compute_symmetric_per_block_quantization_info( - const void* src, size_t height, size_t width, size_t quant_width); +Buffer compute_symmetric_per_block_quantization_info(const void* src, size_t height, size_t width, size_t quant_width); /// Quantizes each block of the matrix using symmetric quantization method. 
/// @@ -158,7 +159,7 @@ std::vector compute_symmetric_per_block_quantization_info( /// /// @return The quantized data matrix. template -std::vector quantize_symmetric_per_block( +Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width); /// Dynamically quantizes each block of the matrix using symmetric quantization method. @@ -184,7 +185,7 @@ std::vector quantize_symmetric_per_block( /// /// @return The quantized data matrix and the quantization scale matrix. template -std::tuple, std::vector> quantize_symmetric_per_block_dynamic( +std::tuple quantize_symmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); /// Computes the quantization information using asymmetric per-block quantization method. @@ -258,7 +259,7 @@ std::tuple, std::vector> quantize_symmetric_per_bl /// /// @return The quantization scale matrix and the quantization zero point matrix. template -std::tuple, std::vector> compute_asymmetric_per_block_quantization_info( +std::tuple compute_asymmetric_per_block_quantization_info( const void* src, size_t height, size_t width, size_t quant_width); /// Quantizes each block of the matrix using asymmetric quantization method. @@ -328,7 +329,7 @@ std::tuple, std::vector> compute_asymmetric_per_bl /// /// @return The quantized data matrix. template -std::vector quantize_asymmetric_per_block( +Buffer quantize_asymmetric_per_block( const void* src, const void* scales, const void* zero_points, size_t height, size_t width, size_t quant_width); /// Dynamically quantizes each block of the matrix using asymmetric quantization method. @@ -355,7 +356,7 @@ std::vector quantize_asymmetric_per_block( /// /// @return The quantized data matrix, the quantization scale matrix and the quantization zero point matrix. 
template -std::tuple, std::vector, std::vector> quantize_asymmetric_per_block_dynamic( +std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); } // namespace kai::test diff --git a/test/reference/reduce.cpp b/test/reference/reduce.cpp index d4935c3f..7045668d 100644 --- a/test/reference/reduce.cpp +++ b/test/reference/reduce.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/int4.hpp" @@ -30,12 +31,10 @@ T scalar_reduce(T curr_value, T new_value) { } template -std::vector reduce_any_op_type(const void* src, size_t height, size_t width, size_t dimension) { - std::vector dst; - +Buffer reduce_any_op_type(const void* src, size_t height, size_t width, size_t dimension) { switch (dimension) { - case 0: - dst.resize(height * size_in_bits / 8); + case 0: { + Buffer dst(height * size_in_bits / 8); KAI_ASSUME(height * size_in_bits % 8 == 0); for (size_t y = 0; y < height; ++y) { @@ -49,10 +48,11 @@ std::vector reduce_any_op_type(const void* src, size_t height, size_t w write_array(dst.data(), y, acc); } - break; + return dst; + } - case 1: - dst.resize(width * size_in_bits / 8); + case 1: { + Buffer dst(width * size_in_bits / 8); KAI_ASSUME(width * size_in_bits % 8 == 0); for (size_t x = 0; x < width; ++x) { @@ -66,17 +66,16 @@ std::vector reduce_any_op_type(const void* src, size_t height, size_t w write_array(dst.data(), x, acc); } - break; + return dst; + } default: KAI_ERROR("Only 2D data is supported!"); } - - return dst; } template -std::vector reduce_any_op( +Buffer reduce_any_op( const void* src, const DataFormat& src_format, size_t height, size_t 
width, const DataFormat& dst_format, size_t dimension) { KAI_ASSUME(src_format.is_raw()); @@ -106,15 +105,15 @@ std::vector reduce_any_op( } // namespace -std::vector reduce_add( +Buffer reduce_add( const void* src, const DataFormat& src_format, size_t height, size_t width, const DataFormat& dst_format, size_t dimension) { return reduce_any_op(src, src_format, height, width, dst_format, dimension); } template -std::vector reduce_add_x(const void* src, size_t height, size_t width) { - std::vector dst(round_up_division(height * size_in_bits, 8)); +Buffer reduce_add_x(const void* src, size_t height, size_t width) { + Buffer dst(round_up_division(height * size_in_bits, 8)); for (size_t y = 0; y < height; ++y) { Accumulator acc = 0; @@ -129,7 +128,7 @@ std::vector reduce_add_x(const void* src, size_t height, size_t width) return dst; } -template std::vector reduce_add_x(const void* src, size_t height, size_t width); +template Buffer reduce_add_x(const void* src, size_t height, size_t width); template T reduce_min(const void* src, size_t len) { diff --git a/test/reference/reduce.hpp b/test/reference/reduce.hpp index 449dc57f..8341f442 100644 --- a/test/reference/reduce.hpp +++ b/test/reference/reduce.hpp @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -29,7 +31,7 @@ enum class ReductionOperator : uint32_t { /// @param[in] dimension Reduction dimension. /// /// @return The reduced matrix. -std::vector reduce_add( +Buffer reduce_add( const void* src, const DataFormat& src_format, size_t height, size_t width, const DataFormat& dst_format, size_t dimension); @@ -44,7 +46,7 @@ std::vector reduce_add( /// /// @return The vector containing the sum of each input matrix row. template -std::vector reduce_add_x(const void* src, size_t height, size_t width); +Buffer reduce_add_x(const void* src, size_t height, size_t width); /// Retrieve the minimum value in a provided matrix. 
/// diff --git a/test/reference/reorder.cpp b/test/reference/reorder.cpp index 61ba67d1..2ab8eed2 100644 --- a/test/reference/reorder.cpp +++ b/test/reference/reorder.cpp @@ -10,18 +10,18 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" namespace kai::test { template -std::vector reorder_block( - const void* src, size_t height, size_t width, size_t block_height, size_t block_width) { +Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_height, size_t block_width) { const auto num_dst_elements = round_up_multiple(height, block_height) * round_up_multiple(width, block_width); const auto dst_size = round_up_division(num_dst_elements * size_in_bits, 8); - std::vector dst(dst_size); + Buffer dst(dst_size); size_t dst_index = 0; for (size_t y_block = 0; y_block < height; y_block += block_height) { @@ -44,9 +44,9 @@ std::vector reorder_block( return dst; } -template std::vector reorder_block( +template Buffer reorder_block( const void* src, size_t height, size_t width, size_t block_height, size_t block_width); -template std::vector reorder_block( +template Buffer reorder_block( const void* src, size_t height, size_t width, size_t block_height, size_t block_width); } // namespace kai::test diff --git a/test/reference/reorder.hpp b/test/reference/reorder.hpp index 48449e37..514c2ea4 100644 --- a/test/reference/reorder.hpp +++ b/test/reference/reorder.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Reorders the input matrix block by block. @@ -66,7 +68,6 @@ namespace kai::test { /// @param[in] The reordered matrix. 
/// ``` template -std::vector reorder_block( - const void* src, size_t height, size_t width, size_t block_height, size_t block_width); +Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_height, size_t block_width); } // namespace kai::test diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp index 84958422..1a3dd8c1 100644 --- a/test/reference/transpose.cpp +++ b/test/reference/transpose.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,18 +12,18 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" namespace kai::test { -std::vector transpose(const void* data, DataType data_type, size_t height, size_t width) { +Buffer transpose(const void* data, DataType data_type, size_t height, size_t width) { KAI_ASSUME(data_type_size_in_bits(data_type) % 8 == 0); const auto element_size = data_type_size_in_bits(data_type) / 8; - std::vector output; - output.resize(height * width * element_size); + Buffer output(height * width * element_size); const auto* src_ptr = reinterpret_cast(data); @@ -39,10 +39,10 @@ std::vector transpose(const void* data, DataType data_type, size_t heig } template -std::vector transpose_with_padding( +Buffer transpose_with_padding( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size) { - std::vector output(dst_size); + Buffer output(dst_size); for (size_t y = 0; y < width; ++y) { for (size_t x = 0; x < height; ++x) { @@ -54,17 +54,17 @@ std::vector transpose_with_padding( return output; } -template std::vector transpose_with_padding( +template Buffer transpose_with_padding( const void* data, const size_t height, const 
size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size); -template std::vector transpose_with_padding( +template Buffer transpose_with_padding( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size); template -std::vector transpose(const void* src, size_t height, size_t width) { - std::vector dst(round_up_division(height * width * size_in_bits, 8)); +Buffer transpose(const void* src, size_t height, size_t width) { + Buffer dst(round_up_division(height * width * size_in_bits, 8)); for (size_t y = 0; y < width; ++y) { for (size_t x = 0; x < height; ++x) { @@ -75,7 +75,7 @@ std::vector transpose(const void* src, size_t height, size_t width) { return dst; } -template std::vector transpose(const void* src, size_t height, size_t width); -template std::vector transpose(const void* src, size_t height, size_t width); +template Buffer transpose(const void* src, size_t height, size_t width); +template Buffer transpose(const void* src, size_t height, size_t width); } // namespace kai::test diff --git a/test/reference/transpose.hpp b/test/reference/transpose.hpp index 306bc89d..11f031aa 100644 --- a/test/reference/transpose.hpp +++ b/test/reference/transpose.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -22,7 +23,7 @@ namespace kai::test { /// @param[in] width Number of columns. /// /// @return The transposed matrix. -std::vector transpose(const void* data, DataType data_type, size_t height, size_t width); +Buffer transpose(const void* data, DataType data_type, size_t height, size_t width); /// Transposes the matrix. 
/// Works for non-packed and packed using provided strides. @@ -37,7 +38,7 @@ std::vector transpose(const void* data, DataType data_type, size_t heig /// @return The transposed matrix. /// template -std::vector transpose_with_padding( +Buffer transpose_with_padding( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size); /// @@ -49,6 +50,6 @@ std::vector transpose_with_padding( /// /// @return The transposed matrix. template -std::vector transpose(const void* src, size_t height, size_t width); +Buffer transpose(const void* src, size_t height, size_t width); } // namespace kai::test diff --git a/test/tests/buffer_test.cpp b/test/tests/buffer_test.cpp index fbd5e259..e6cca044 100644 --- a/test/tests/buffer_test.cpp +++ b/test/tests/buffer_test.cpp @@ -43,7 +43,7 @@ TEST(Buffer, NonePolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); } @@ -99,7 +99,7 @@ TEST(Buffer, ProtectUnderflowPolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); ASSERT_NE(data, MAP_FAILED); @@ -141,7 +141,7 @@ TEST(Buffer, ProtectOverflowPolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); ASSERT_NE(data, MAP_FAILED); diff --git a/test/tests/imatmul_test.cpp b/test/tests/imatmul_test.cpp index dd12fd4f..70b28e55 100644 --- a/test/tests/imatmul_test.cpp +++ b/test/tests/imatmul_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h" +#include 
"test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" @@ -121,9 +122,6 @@ struct IndirectMatMul { MatMulIndirectKernel imatmul; }; -/// Simple byte buffer -using Buffer = std::vector; - /// Convenience type for test list using IndirectMatMulArray = std::array; diff --git a/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp index 2363ace7..66f92189 100644 --- a/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp @@ -18,6 +18,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -125,13 +126,13 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. + Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. + Buffer bias{}; ///< Bias. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. }; /// Gets the test data for the current test case. 
@@ -154,7 +155,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -165,7 +166,7 @@ protected: const auto rhs_w = info.n; auto rhs = fill_matrix_random(rhs_h, rhs_w, method.rhs_format, 1); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -173,14 +174,13 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); } - std::vector packed_rhs; - packed_rhs.resize(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + Buffer packed_rhs(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); if (has_rhs_pack) { const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); @@ -257,9 +257,8 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto lhs_start_row = rect.start_row(); const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - std::vector lhs_data; const size_t lhs_packed_size = method.fn_get_packed_lhs_size(info.m, info.k, method.m0, method.k0, 1 /* sr */); - lhs_data.resize(lhs_packed_size); + Buffer lhs_data(lhs_packed_size); uintptr_t lhs_offset = method.fn_get_lhs_offset(lhs_start_row, lhs_stride); uintptr_t lhs_packed_offset = method.fn_get_packed_lhs_offset(lhs_start_row, info.k); @@ -271,9 +270,8 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - std::vector rhs_data; const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data.resize(rhs_packed_size); + Buffer rhs_data(rhs_packed_size); const auto packed_rhs_start_row = rect.start_col(); const auto 
packed_rhs_start_col = 0; @@ -309,8 +307,7 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data.data() + lhs_packed_offset, rhs_data.data() + rhs_packed_offset, NULL, dst.data() + dst_offset, lhs_stride, rhs_stride, dst_stride, -std::numeric_limits::infinity(), diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp index de55cecf..795abc06 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp @@ -22,6 +22,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp_qsi4cxp_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -93,7 +94,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Generates input data. const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -128,7 +129,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon(lhs_start_row, lhs_stride); @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { ref_rhs_qsi4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(rhs_start_row, K, nr, kr, sr); auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); @@ -157,7 +158,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { params.rhs_zero_point = 0; kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0( - 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi4_padded.data()), has_bias ? reinterpret_cast(ref_biases.data()) : nullptr, reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); @@ -171,7 +172,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp index 0500d8f6..3f59f645 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp @@ -24,6 +24,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -94,7 +95,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Generates input data. const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -129,7 +130,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon(lhs_start_row, lhs_stride); @@ -143,7 +144,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { imp_packed_lhs.data() + lhs_packed_offset); const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(rhs_start_row, K, nr, kr, sr); auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); @@ -166,7 +167,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp index de6c3a90..14838981 100644 --- a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p_qai4c32p_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -85,7 +86,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { // Generates input data. 
const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -121,7 +122,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(lhs_start_row, lhs_stride); @@ -139,7 +140,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const size_t num_blocks_per_row = round_up_division(K, bl); const size_t ref_zp_size = N * num_blocks_per_row; const size_t ref_zp_size_in_bytes = ref_zp_size * sizeof(float); - std::vector ref_rhs_zp_f32(ref_zp_size_in_bytes); + Buffer ref_rhs_zp_f32(ref_zp_size_in_bytes); for (size_t i = 0; i < ref_zp_size; ++i) { reinterpret_cast(ref_rhs_zp_f32.data())[i] = -reinterpret_cast(ref_rhs_zero_points.data())[i] * @@ -154,7 +155,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(rhs_start_row, K, nr, kr, bl); @@ -167,8 +168,8 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { params.rhs_zero_point = 8; kai_run_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon( - 1, N, K, nr, kr, sr, bl, ref_rhs_qau4s0s1.data(), ref_rhs_zp_f32.data(), has_bias ? 
ref_biases.data() : nullptr, - ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qau4s0s1.data()), ref_rhs_zp_f32.data(), + has_bias ? ref_biases.data() : nullptr, ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(uint16_t); const auto dst_stride_col = sizeof(uint16_t); @@ -180,7 +181,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index 2b4ca713..5dd2f3b9 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ -19,6 +19,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -322,13 +323,13 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. + Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. 
+ Buffer bias{}; ///< Bias. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. }; /// Gets the test data for the current test case. @@ -351,7 +352,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -362,7 +363,7 @@ protected: const auto rhs_w = info.n; auto rhs = fill_matrix_random(rhs_h, rhs_w, method.rhs_format, 1); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -370,7 +371,7 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); @@ -379,11 +380,11 @@ protected: constexpr size_t nr = 12; constexpr size_t kr = 4; - std::vector packed_rhs; + Buffer packed_rhs; if (method.fn_get_packed_rhs_size) { - packed_rhs.resize(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + packed_rhs = Buffer(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); } else if (method.fn_get_packed_rhs_size_generic_block_size) { - packed_rhs.resize(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); + packed_rhs = Buffer(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); } else { KAI_ERROR("No function to calculate Packed Rhs Matrix Size"); } @@ -462,9 +463,8 @@ TEST_P(MatMulTestBf16, Output) { const auto lhs_start_row = rect.start_row(); const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - std::vector lhs_data; const size_t lhs_packed_size = method.fn_get_packed_lhs_size(info.m, info.k, method.m0, method.k0, 1 /* sr */); - lhs_data.resize(lhs_packed_size); + Buffer lhs_data(lhs_packed_size); 
uintptr_t lhs_offset = method.fn_get_lhs_offset(lhs_start_row, lhs_stride); uintptr_t lhs_packed_offset = method.fn_get_packed_lhs_offset(lhs_start_row, info.k); @@ -476,15 +476,15 @@ TEST_P(MatMulTestBf16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - std::vector rhs_data; + Buffer rhs_data; if (method.fn_get_packed_rhs_size_generic_block_size) { const size_t rhs_packed_size = method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); - rhs_data.resize(rhs_packed_size); + rhs_data = Buffer(rhs_packed_size); } else if (method.fn_get_packed_rhs_size) { const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data.resize(rhs_packed_size); + rhs_data = Buffer(rhs_packed_size); } const auto packed_rhs_start_row = rect.start_col(); @@ -521,8 +521,7 @@ TEST_P(MatMulTestBf16, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data.data() + lhs_packed_offset, rhs_data.data() + rhs_packed_offset, NULL, dst.data() + dst_offset, lhs_stride, rhs_stride, dst_stride, -std::numeric_limits::infinity(), diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index c80e42ad..00774b1e 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -72,7 +72,7 @@ Buffer fill_matrix_raw(size_t height, size_t width, std::function % 8 == 0); Buffer data(size); - auto ptr = static_cast(data.data()); + auto ptr = reinterpret_cast(data.data()); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ -170,7 +170,7 @@ TEST_P(MatMulTest_f32_f32_f32p, EndToEnd) // NOLINT(google-readability-avoid-un Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - m, n, k, 
ref_lhs.data(), 1, imp_packed_rhs->data(), static_cast(imp_dst.data()), 1, 1, clamp_min, + m, n, k, ref_lhs.data(), 1, imp_packed_rhs->data(), reinterpret_cast(imp_dst.data()), 1, 1, clamp_min, clamp_max); // Compare the output of the micro-kernels against the output of the reference implementation. diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index a8d3fb8f..65304569 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -36,6 +36,7 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -203,7 +204,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -227,7 +228,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = @@ -245,7 +246,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { params.scale_dt = kai_datatype::kai_dt_bf16; kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + 1, rect.width() /* n */, K, nr, kr, sr, bl, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -259,7 +261,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { ASSERT_EQ(imp_dst_size, ref_dst.size()); // Runs the GEMM micro-kernel. - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -349,7 +351,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -382,7 +384,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; @@ -390,7 +392,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { params.scale_dt = kai_datatype::kai_dt_bf16; kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + 1, rect.width() /* n */, K, nr, kr, sr, bl, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, reinterpret_cast(ref_biases.data() + bias_offset), ref_rhs_scales.data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -402,7 +405,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 5a770fbe..6459c87b 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -33,6 +33,7 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -325,7 +326,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -353,13 +354,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { size_t bias_offset = rhs_start_row * sizeof(float); size_t scale_offset = rhs_start_row * sizeof(float); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 0; ukernel_variant.run_rhs_pack( - 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsi4_padded.data() + rhs_offset, + 1, rect.width() /* n */, K, nr, kr, sr, + reinterpret_cast(ref_rhs_qsi4_padded.data() + rhs_offset), reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -372,7 +374,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -452,7 +454,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -481,12 +483,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { size_t bias_offset = rhs_start_row * sizeof(float); size_t scale_offset = rhs_start_row * sizeof(float); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; ukernel_variant.run_rhs_pack( - 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsu4_padded.data() + rhs_offset, + 1, rect.width() /* n */, K, nr, kr, sr, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -498,7 +501,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -592,7 +595,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); @@ -615,13 +618,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 0; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi4_padded.data()), + reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -631,7 +635,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -723,7 +727,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -748,13 +752,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsu4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsu4_padded.data()), + reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -764,7 +769,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp index f22671a1..64a079cd 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp @@ -24,6 +24,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" #include "test/common/matrix_portion.hpp" @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); const auto lhs_start_row = rect.start_row(); size_t lhs_stride = K * sizeof(float); @@ -162,7 +163,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // * Packs the RHS matrix. 
const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_qsi8cx_params params{.lhs_zero_point = 1, .scale_multiplier = 1.0f}; kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon( 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi8.data()), @@ -186,7 +187,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + matmul_lhs_packed_offset, imp_packed_rhs.data() + matmul_rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -276,7 +277,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height(), K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data() + lhs_offset), K * sizeof(float), imp_packed_lhs.data() + lhs_packed_offset); @@ -286,7 +287,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // * Packs the RHS matrix. 
const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_qsi8cx_params params{.lhs_zero_point = 1, .scale_multiplier = 1.0f}; kai_run_rhs_pack_kxn_qsi8cxp_qsi8cx_neon( 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi8.data()), @@ -310,7 +311,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + matmul_lhs_packed_offset, imp_packed_rhs.data() + matmul_rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp index 14184151..da00f5cc 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p_qai4c32p_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -83,7 +84,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { // Generates input data. 
const auto ref_lhs = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -112,7 +113,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(float); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(lhs_start_row, lhs_stride); @@ -130,7 +131,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const size_t num_blocks_per_row = round_up_division(K, bl); const size_t ref_zp_size = N * num_blocks_per_row; const size_t ref_zp_size_in_bytes = ref_zp_size * sizeof(float); - std::vector ref_rhs_zp_f32(ref_zp_size_in_bytes); + Buffer ref_rhs_zp_f32(ref_zp_size_in_bytes); for (size_t i = 0; i < ref_zp_size; ++i) { reinterpret_cast(ref_rhs_zp_f32.data())[i] = -reinterpret_cast(ref_rhs_zero_points.data())[i] * @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(rhs_start_row, K, nr, kr, bl); @@ -158,8 +159,8 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { params.rhs_zero_point = 8; kai_run_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon( - 1, N, K, nr, kr, sr, bl, ref_rhs_qau4s0s1.data(), ref_rhs_zp_f32.data(), has_bias ? 
ref_biases.data() : nullptr, - ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qau4s0s1.data()), ref_rhs_zp_f32.data(), + has_bias ? ref_biases.data() : nullptr, ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(float); const auto dst_stride_col = sizeof(float); @@ -171,7 +172,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index a93a0b65..4d82a9d7 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -28,6 +28,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/float16.hpp" #include "test/common/int4.hpp" @@ -208,7 +209,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = ukernel_variant.pack_interface.lhs_packed_size(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(float); auto lhs_offset = ukernel_variant.pack_interface.get_lhs_offset(lhs_start_row, lhs_stride); @@ -227,7 +228,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { pack_data_scales_interleave_block(ref_rhs_qsu4.data(), ref_rhs_scales.data(), N, K, bl); const auto imp_packed_rhs_size = ukernel_variant.pack_interface.rhs_packed_size(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = ukernel_variant.pack_interface.get_rhs_packed_offset(rhs_start_row, K, nr, kr, bl); auto rhs_matmul_offset = ukernel_variant.ukernel.interface.get_rhs_packed_offset(rhs_start_row, K, bl); @@ -235,7 +236,8 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { const kai_rhs_pack_qs4cxs1s0_param params{.lhs_zero_point = 1, .rhs_zero_point = 8}; ukernel_variant.pack_interface.rhs_pack( - 1, N, K, nr, kr, sr, bl, ref_rhs_qsu4_scale_f16.data(), nullptr, imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qsu4_scale_f16.data()), nullptr, + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(float); const auto dst_stride_col = sizeof(float); @@ -247,7 +249,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.ukernel.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.ukernel.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index e3c1cb91..3dacdf78 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -30,6 +30,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_pack_x8p2vlx4_x8_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" #include "test/common/matrix_portion.hpp" @@ -51,7 +52,6 @@ namespace kai::test { // Ensure static linkage for all functionality local to this test file namespace { -using Buffer = std::vector; using IndirectionBuffer = std::vector; struct KChunk { @@ -458,7 +458,7 @@ const TestReference& get_test_reference(const TestDataId& test_data_id) { const size_t idx = m_i * k_chunk_count + k_chunk_idx; if (pad_testing and m_i == 0) { // Push padding pointers for first row - lhs_qai8_indirect[idx] = lhs_padding.data(); + lhs_qai8_indirect[idx] = reinterpret_cast(lhs_padding.data()); } else { uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len; lhs_qai8_indirect[idx] = reinterpret_cast(offset); @@ -591,11 +591,14 @@ void test_lhs_pack( output_area.end_row(), shape.k, variant.acc_pack.m, variant.acc_pack.k, 1) : imp_packed_lhs_size; + const auto* imp_packed_lhs_ptr = reinterpret_cast(imp_packed_lhs.data()); + const auto* 
ref_packed_lhs_ptr = reinterpret_cast(reference.packed_lhs.data()); + for (size_t i = 0; i < reference.packed_lhs.size(); ++i) { if (i >= imp_packed_lhs_offset && i < imp_packed_lhs_end_offset) { - ASSERT_EQ(imp_packed_lhs[i], reference.packed_lhs[i]); + ASSERT_EQ(imp_packed_lhs_ptr[i], ref_packed_lhs_ptr[i]); } else { - ASSERT_EQ(imp_packed_lhs[i], 0); + ASSERT_EQ(imp_packed_lhs_ptr[i], 0); } } } @@ -627,13 +630,16 @@ void test_rhs_pack( : imp_packed_rhs_size; size_t mismatches = 0; + const auto* imp_packed_rhs_ptr = reinterpret_cast(imp_packed_rhs.data()); + const auto* ref_packed_rhs_ptr = reinterpret_cast(reference.packed_rhs.data()); + for (size_t i = 0; i < reference.packed_rhs.size(); ++i) { if (i >= imp_packed_rhs_offset && i < imp_packed_rhs_end_offset) { - if (imp_packed_rhs[i] != reference.packed_rhs[i]) { + if (imp_packed_rhs_ptr[i] != ref_packed_rhs_ptr[i]) { mismatches += 1; } } else { - if (imp_packed_rhs[i] != 0) { + if (imp_packed_rhs_ptr[i] != 0) { mismatches += 1; } } diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index dbae9718..3c41541c 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -19,6 +19,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -265,16 +266,16 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector rhs_t{}; ///< Transposed RHS matrix. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. - float clamp_min{}; ///< Minimum output value. - float clamp_max{}; ///< Maximum output value. 
+ Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. + Buffer bias{}; ///< Bias. + Buffer rhs_t{}; ///< Transposed RHS matrix. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. + float clamp_min{}; ///< Minimum output value. + float clamp_max{}; ///< Maximum output value. }; /// Gets the test data for the current test case. @@ -297,7 +298,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -311,7 +312,7 @@ protected: KAI_ASSUME(method.rhs_format.is_raw()); auto rhs_t = transpose(rhs.data(), method.rhs_format.data_type(), rhs_h, rhs_w); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -319,17 +320,17 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); } - std::vector packed_rhs; + Buffer packed_rhs; if (has_rhs_pack) { packed_rhs = matmul_pack_rhs( - rhs.data(), !rhs_scales.empty() ? 
rhs_scales.data() : nullptr, bias.data(), method.rhs_format, - method.packed_rhs_format, info.n, info.k, true); + rhs.data(), rhs_scales.data(), bias.data(), method.rhs_format, method.packed_rhs_format, info.n, info.k, + true); } KAI_ASSUME(method.lhs_format.is_raw()); @@ -439,8 +440,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_offset = method.packed_lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(packed_lhs_offset, ref_packed_lhs_offset); - std::vector packed_lhs; - packed_lhs.resize(packed_lhs_size); + Buffer packed_lhs(packed_lhs_size); method.fn_pack_lhs( rect.height(), rect.width(), mr, kr, sr, 0, data.lhs.data() + lhs_offset, ref_lhs_row_stride, packed_lhs.data() + packed_lhs_offset); @@ -509,10 +509,10 @@ TEST_P(MatMulTest, PackedRhs) { ASSERT_EQ(bias_offset, ref_bias_offset); /** Perform RHS packing, and compare with reference result **/ - std::vector packed_rhs(packed_rhs_size, 0); + Buffer packed_rhs(packed_rhs_size, 0); method.pack_rhs( height, width, data.rhs.data() + rhs_offset, rhs_row_stride, data.bias.data() + bias_offset, - !data.rhs_scales.empty() ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, + data.rhs_scales.data() != nullptr ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, packed_rhs.data() + packed_rhs_offset); const bool exact = method.packed_rhs_format.pack_format() != DataFormat::PackFormat::QUANTIZE_PER_ROW; @@ -570,12 +570,11 @@ TEST_P(MatMulTest, PackedTransposedRhs) { const auto ref_bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), info.n); ASSERT_EQ(bias_offset, ref_bias_offset); - std::vector packed_rhs; - packed_rhs.resize(packed_rhs_size); + Buffer packed_rhs(packed_rhs_size); method.pack_rhs_nxk( rect.height(), rect.width(), data.rhs_t.data() + rhs_offset, ref_rhs_row_stride, data.bias.data() + bias_offset, - !data.rhs_scales.empty() ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, + data.rhs_scales.data() != nullptr ? 
data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, packed_rhs.data() + packed_rhs_offset); const auto exact = method.packed_rhs_format.pack_format() != DataFormat::PackFormat::QUANTIZE_PER_ROW; @@ -619,7 +618,7 @@ TEST_P(MatMulTest, Output) { const auto lhs_start_col = 0; const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - const uint8_t* lhs_data = nullptr; + const std::byte* lhs_data = nullptr; uintptr_t lhs_offset = 0; if (method.is_pack_lhs_needed()) { @@ -639,7 +638,7 @@ TEST_P(MatMulTest, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(rhs_w); - const uint8_t* rhs_data = nullptr; + const std::byte* rhs_data = nullptr; uintptr_t rhs_offset = 0; if (method.is_pack_rhs_needed()) { @@ -672,8 +671,7 @@ TEST_P(MatMulTest, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data + lhs_offset, rhs_data + rhs_offset, bias_data + bias_offset, -- GitLab From d4334fe689aa39e0a6fcc5bdfd5c41694b5cc9f4 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Wed, 7 May 2025 10:56:04 +0100 Subject: [PATCH 2/3] Remove unused headers and update Buffer class * Buffer class now by default doesn't initialize the data buffer. * There is a separate constructor to initialize the buffer with user-provided value. 
Signed-off-by: Viet-Hoa Do --- test/common/buffer.cpp | 4 +++- test/common/buffer.hpp | 3 ++- test/common/int4.hpp | 1 - test/reference/binary_elementwise.cpp | 1 - test/reference/binary_elementwise.hpp | 2 -- test/reference/cast.cpp | 1 - test/reference/cast.hpp | 2 -- test/reference/clamp.cpp | 2 -- test/reference/clamp.hpp | 2 -- test/reference/fill.cpp | 1 - test/reference/fill.hpp | 1 - test/reference/matmul.cpp | 1 - test/reference/matmul.hpp | 1 - test/reference/matmul_pack.cpp | 2 -- test/reference/matmul_pack.hpp | 2 -- test/reference/pack.cpp | 1 - test/reference/pack.hpp | 2 -- test/reference/pad.cpp | 1 - test/reference/pad.hpp | 1 - test/reference/quantize.cpp | 1 - test/reference/quantize.hpp | 1 - test/reference/reduce.cpp | 1 - test/reference/reduce.hpp | 1 - test/reference/reorder.cpp | 2 +- test/reference/reorder.hpp | 2 -- test/reference/transpose.cpp | 1 - test/reference/transpose.hpp | 2 -- .../matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp | 1 - .../matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp | 1 - ...atmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp | 1 - .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 1 - test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 1 - ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 1 - .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 1 - .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp | 1 - ...atmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp | 1 - ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 1 - .../matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 21 ++++++++----------- test/tests/matmul_test.cpp | 7 +++---- 39 files changed, 18 insertions(+), 62 deletions(-) diff --git a/test/common/buffer.cpp b/test/common/buffer.cpp index 3945eba2..86343d0d 100644 --- a/test/common/buffer.cpp +++ b/test/common/buffer.cpp @@ -21,7 +21,7 @@ namespace kai::test { -Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) { +Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { KAI_ASSUME_MSG(size > 0, "Buffers must be of non-zero 
size"); const char* val = getenv("KAI_TEST_BUFFER_POLICY"); @@ -57,7 +57,9 @@ Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) default: allocate(); } +} +Buffer::Buffer(const size_t size, uint8_t init_value) : Buffer(size) { memset(data(), init_value, size); } diff --git a/test/common/buffer.hpp b/test/common/buffer.hpp index 4cb06e4b..8224f1f0 100644 --- a/test/common/buffer.hpp +++ b/test/common/buffer.hpp @@ -30,7 +30,8 @@ class Buffer { public: Buffer() = default; - Buffer(size_t size, uint8_t init_value = 0); + explicit Buffer(size_t size); + Buffer(size_t size, uint8_t init_value); Buffer(const Buffer& other) = delete; Buffer(Buffer&& other) noexcept = default; diff --git a/test/common/int4.hpp b/test/common/int4.hpp index aa05d9bd..73d031d5 100644 --- a/test/common/int4.hpp +++ b/test/common/int4.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/binary_elementwise.cpp b/test/reference/binary_elementwise.cpp index 48434e47..8212195e 100644 --- a/test/reference/binary_elementwise.cpp +++ b/test/reference/binary_elementwise.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/binary_elementwise.hpp b/test/reference/binary_elementwise.hpp index 713f8692..d66d7e36 100644 --- a/test/reference/binary_elementwise.hpp +++ b/test/reference/binary_elementwise.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/cast.cpp b/test/reference/cast.cpp index e11cb350..d6728926 100644 --- a/test/reference/cast.cpp +++ b/test/reference/cast.cpp @@ -8,7 +8,6 @@ #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/cast.hpp b/test/reference/cast.hpp index 8dc09b22..ce9c06c5 100644 --- a/test/reference/cast.hpp +++ 
b/test/reference/cast.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/clamp.cpp b/test/reference/clamp.cpp index eadc755e..c797d3a9 100644 --- a/test/reference/clamp.cpp +++ b/test/reference/clamp.cpp @@ -8,8 +8,6 @@ #include #include -#include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/clamp.hpp b/test/reference/clamp.hpp index 52ca57ba..8dacee33 100644 --- a/test/reference/clamp.hpp +++ b/test/reference/clamp.hpp @@ -7,9 +7,7 @@ #pragma once #include -#include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/fill.cpp b/test/reference/fill.cpp index 82068fdb..0e155340 100644 --- a/test/reference/fill.cpp +++ b/test/reference/fill.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/fill.hpp b/test/reference/fill.hpp index 9dd0f26c..29c9cf3b 100644 --- a/test/reference/fill.hpp +++ b/test/reference/fill.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index be5bb6a6..81613212 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index d349d0b4..8ef06490 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/matmul_pack.cpp b/test/reference/matmul_pack.cpp index 40139e09..973cf9b5 100644 --- a/test/reference/matmul_pack.cpp +++ b/test/reference/matmul_pack.cpp @@ -7,8 +7,6 @@ #include 
"test/reference/matmul_pack.hpp" #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/round.hpp" diff --git a/test/reference/matmul_pack.hpp b/test/reference/matmul_pack.hpp index ea713dd7..5bf57659 100644 --- a/test/reference/matmul_pack.hpp +++ b/test/reference/matmul_pack.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/pack.cpp b/test/reference/pack.cpp index e06e3303..cb193917 100644 --- a/test/reference/pack.cpp +++ b/test/reference/pack.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/pack.hpp b/test/reference/pack.hpp index b424d5ba..29180290 100644 --- a/test/reference/pack.hpp +++ b/test/reference/pack.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index 182f6dc8..e857ce74 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index 74c0229a..c21b8c9a 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index c64d8404..008f6676 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "test/common/bfloat16.hpp" #include "test/common/buffer.hpp" diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index e0c7d1f0..d0aa3bcd 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -9,7 +9,6 @@ #include #include #include -#include #include 
"test/common/buffer.hpp" diff --git a/test/reference/reduce.cpp b/test/reference/reduce.cpp index 7045668d..085ac063 100644 --- a/test/reference/reduce.cpp +++ b/test/reference/reduce.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/reduce.hpp b/test/reference/reduce.hpp index 8341f442..8f1ccea7 100644 --- a/test/reference/reduce.hpp +++ b/test/reference/reduce.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/reorder.cpp b/test/reference/reorder.cpp index 2ab8eed2..12078730 100644 --- a/test/reference/reorder.cpp +++ b/test/reference/reorder.cpp @@ -21,7 +21,7 @@ Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_ const auto num_dst_elements = round_up_multiple(height, block_height) * round_up_multiple(width, block_width); const auto dst_size = round_up_division(num_dst_elements * size_in_bits, 8); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); size_t dst_index = 0; for (size_t y_block = 0; y_block < height; y_block += block_height) { diff --git a/test/reference/reorder.hpp b/test/reference/reorder.hpp index 514c2ea4..8453ce1e 100644 --- a/test/reference/reorder.hpp +++ b/test/reference/reorder.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp index 1a3dd8c1..7c8fbe9e 100644 --- a/test/reference/transpose.cpp +++ b/test/reference/transpose.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/transpose.hpp b/test/reference/transpose.hpp index 11f031aa..f6f8c343 100644 --- a/test/reference/transpose.hpp +++ b/test/reference/transpose.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff 
--git a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp index 795abc06..5b529876 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp index 3f59f645..3a1710f8 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp index 14838981..5ba1a687 100644 --- a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h" diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index 5dd2f3b9..de0f4ffb 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ 
-16,7 +16,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index 00774b1e..7e811c75 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 65304569..b191fd4e 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 6459c87b..dd6a27a2 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp index 64a079cd..1bad32a6 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include 
"kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp index da00f5cc..699ced2a 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h" diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index 4d82a9d7..24326307 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h" diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index 3dacdf78..62d544ad 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/imatmul_clamp_qai8_qai8p_qsi8cxp/kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa.h" @@ -52,8 +51,6 @@ namespace kai::test { // Ensure static linkage for all 
functionality local to this test file namespace { -using IndirectionBuffer = std::vector; - struct KChunk { size_t count; size_t length; @@ -361,7 +358,7 @@ struct TestReference { Buffer lhs_qai8; Buffer lhs_qai8_scales; Buffer lhs_qai8_zero_points; - IndirectionBuffer lhs_qai8_indirect; + Buffer lhs_qai8_indirect; Buffer lhs_qai8_indirect_packed; Buffer lhs_qai8_indirect_padding; size_t lhs_qai8_indirect_offset; @@ -451,17 +448,18 @@ const TestReference& get_test_reference(const TestDataId& test_data_id) { // Setup an indirection buffer, where each "row" contains `k_chunk_count` // pointers to chunks of length `k_chunk_len` in the input_buffer - IndirectionBuffer lhs_qai8_indirect(shape.m * k_chunk_count); + Buffer lhs_qai8_indirect(shape.m * k_chunk_count * sizeof(void*)); Buffer lhs_padding(k_chunk_len, padding_value); + auto* lhs_qai8_indirect_ptr = reinterpret_cast(lhs_qai8_indirect.data()); for (size_t m_i = 0; m_i < shape.m; ++m_i) { for (size_t k_chunk_idx = 0; k_chunk_idx < k_chunk_count; ++k_chunk_idx) { const size_t idx = m_i * k_chunk_count + k_chunk_idx; if (pad_testing and m_i == 0) { // Push padding pointers for first row - lhs_qai8_indirect[idx] = reinterpret_cast(lhs_padding.data()); + lhs_qai8_indirect_ptr[idx] = reinterpret_cast(lhs_padding.data()); } else { uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len; - lhs_qai8_indirect[idx] = reinterpret_cast(offset); + lhs_qai8_indirect_ptr[idx] = reinterpret_cast(offset); } } } @@ -576,7 +574,7 @@ void test_lhs_pack( variant.lhs_pack->get_packed_lhs_size(shape.m, shape.k, variant.acc_pack.m, variant.acc_pack.k, 1); ASSERT_EQ(imp_packed_lhs_size, reference.packed_lhs.size()); - Buffer imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size, 0); const auto imp_lhs_offset = variant.lhs_pack->get_lhs_offset(output_area.start_row(), shape.k * sizeof(int8_t)); const auto imp_packed_lhs_offset = variant.lhs_pack->get_packed_lhs_offset( output_area.start_row(), shape.k, 
variant.acc_pack.m, variant.acc_pack.k, 1); @@ -608,7 +606,7 @@ void test_rhs_pack( const MatMulShape& shape, const MatMulVariant& variant, const Rect& output_area, const TestReference& reference) { const auto imp_packed_rhs_size = variant.rhs_pack.get_packed_rhs_size(shape.n, shape.k); ASSERT_EQ(imp_packed_rhs_size, reference.packed_rhs.size()); - Buffer imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size, 0); const auto imp_rhs_offset = variant.rhs_pack.get_rhs_offset(output_area.start_col()); const auto imp_bias_offset = variant.rhs_pack.get_bias_offset(output_area.start_col()); @@ -686,7 +684,7 @@ void test_matmul( const auto imp_dst_size = variant.matmul.get_dst_size(shape.m, shape.n); ASSERT_EQ(imp_dst_size, reference.dst_qsi8_clamped.size()); - Buffer imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size, 0); const auto [imp_lhs_offset, lhs_data] = [&]() -> std::tuple { if (variant.lhs_pack.has_value()) { return {variant.matmul.get_packed_lhs_offset(output_area.start_row(), shape.k), reference.packed_lhs}; @@ -814,7 +812,6 @@ static Buffer rhs_pack( const KChunk& k_chunk) { // Allocate output buffer const size_t dst_size = variant.get_packed_rhs_size(n, k_chunk.count, k_chunk.length); - Buffer packed_all(dst_size); Buffer packed(dst_size); // Calculate effective quantization parameters @@ -853,7 +850,7 @@ static Buffer matmul( // Allocate output buffer const size_t dst_size = variant.get_dst_size(shape.m, shape.n); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); // Calculate effective quantization parameters kai_matmul_requantize32_params requantization{}; diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index 3c41541c..c828f56d 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" @@ -440,7 +439,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_offset =
method.packed_lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(packed_lhs_offset, ref_packed_lhs_offset); - Buffer packed_lhs(packed_lhs_size); + Buffer packed_lhs(packed_lhs_size, 0); method.fn_pack_lhs( rect.height(), rect.width(), mr, kr, sr, 0, data.lhs.data() + lhs_offset, ref_lhs_row_stride, packed_lhs.data() + packed_lhs_offset); @@ -570,7 +569,7 @@ TEST_P(MatMulTest, PackedTransposedRhs) { const auto ref_bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), info.n); ASSERT_EQ(bias_offset, ref_bias_offset); - Buffer packed_rhs(packed_rhs_size); + Buffer packed_rhs(packed_rhs_size, 0); method.pack_rhs_nxk( rect.height(), rect.width(), data.rhs_t.data() + rhs_offset, ref_rhs_row_stride, data.bias.data() + bias_offset, @@ -671,7 +670,7 @@ TEST_P(MatMulTest, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data + lhs_offset, rhs_data + rhs_offset, bias_data + bias_offset, -- GitLab From b417754c7b2ec0e51056363add75e5912344446c Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Mon, 19 May 2025 09:42:52 +0100 Subject: [PATCH 3/3] Address review comments Signed-off-by: Viet-Hoa Do --- test/reference/pad.cpp | 14 ++++---- test/reference/pad.hpp | 2 +- .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 20 ++++++----- test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 35 +------------------ 4 files changed, 21 insertions(+), 50 deletions(-) diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index e857ce74..4a19555f 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -50,13 +50,15 @@ Buffer pad_matrix( Buffer dst(dst_size); - for (size_t y = 0; y < dst_height; ++y) { - for (size_t x = 0; x < dst_width; ++x) { - if (y >= pad_top && y < pad_top + height && x >= pad_left && x < pad_left + width) { - const T value = 
read_array(data, (y - pad_top) * width + x - pad_left); - write_array(dst.data(), y * dst_width + x, value); + for (size_t row = 0; row < dst_height; ++row) { + for (size_t col = 0; col < dst_width; ++col) { + const bool valid_row = row >= pad_top && row < pad_top + height; + const bool valid_col = col >= pad_left && col < pad_left + width; + if (valid_row && valid_col) { + const T value = read_array(data, (row - pad_top) * width + col - pad_left); + write_array(dst.data(), row * dst_width + col, value); } else { - write_array(dst.data(), y * dst_width + x, pad_value); + write_array(dst.data(), row * dst_width + col, pad_value); } } } diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index c21b8c9a..29089919 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -32,7 +32,7 @@ Buffer pad_row( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size, uint8_t val = 0); -/// Pads the matrix with value. +/// Creates a padded matrix from an input matrix. /// /// @param[in] data The input data buffer. /// @param[in] height The number of input rows. 
diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index de0f4ffb..76ea30b7 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ -379,15 +379,18 @@ protected: constexpr size_t nr = 12; constexpr size_t kr = 4; - Buffer packed_rhs; + size_t packed_rhs_size = 0; + if (method.fn_get_packed_rhs_size) { - packed_rhs = Buffer(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + packed_rhs_size = method.fn_get_packed_rhs_size(rhs_w, rhs_h); } else if (method.fn_get_packed_rhs_size_generic_block_size) { - packed_rhs = Buffer(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); + packed_rhs_size = method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr); } else { KAI_ERROR("No function to calculate Packed Rhs Matrix Size"); } + Buffer packed_rhs(packed_rhs_size); + if (has_rhs_pack) { const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); method.pack_rhs( @@ -475,17 +478,16 @@ TEST_P(MatMulTestBf16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - Buffer rhs_data; + size_t rhs_packed_size = 0; if (method.fn_get_packed_rhs_size_generic_block_size) { - const size_t rhs_packed_size = - method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); - rhs_data = Buffer(rhs_packed_size); + rhs_packed_size = method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); } else if (method.fn_get_packed_rhs_size) { - const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data = Buffer(rhs_packed_size); + rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); } + Buffer rhs_data(rhs_packed_size); + const auto packed_rhs_start_row = rect.start_col(); const auto packed_rhs_start_col = 0; diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp 
b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index 7e811c75..3b37518a 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -30,6 +30,7 @@ #include "test/common/memory.hpp" #include "test/common/test_suite.hpp" #include "test/reference/clamp.hpp" +#include "test/reference/fill.hpp" #include "test/reference/matmul.hpp" namespace kai::test { @@ -63,40 +64,6 @@ const std::array, 2> ukern "matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla", cpu_has_sme2}}}; -// TODO: Reimplement these helpers in fill.cpp. These methods are currently duplicated here so they can be specialized -// on the Buffer return type. -template -Buffer fill_matrix_raw(size_t height, size_t width, std::function gen) { - const auto size = height * width * size_in_bits / 8; - KAI_ASSUME(width * size_in_bits % 8 == 0); - - Buffer data(size); - auto ptr = reinterpret_cast(data.data()); - - for (size_t y = 0; y < height; ++y) { - for (size_t x = 0; x < width; ++x) { - write_array(ptr, y * width + x, gen(y, x)); - } - } - - return data; -} - -template -Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { - using TDist = std::conditional_t< - std::is_floating_point_v, std::uniform_real_distribution, std::uniform_int_distribution>; - - std::mt19937 rnd(seed); - TDist dist; - - return fill_matrix_raw(height, width, [&](size_t, size_t) { return dist(rnd); }); -} - -template -Buffer fill_random(size_t length, uint32_t seed) { - return fill_matrix_random_raw(1, length, seed); -} } // namespace class MatMulTest_f32_f32_f32p : public ::testing::TestWithParam {}; -- GitLab