From d770c069c489d078b605116b9029678bb7ddc506 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 6 May 2025 17:31:24 +0100 Subject: [PATCH 1/3] Use new Buffer class for the entire test framework * Replace `std::vector` by `Buffer` class. * Update `Buffer` class: - Add support for initial value of the buffer. - Always initialize the buffer with 0 by default. * Add `pad_matrix` reference function to support extending the data buffer. Signed-off-by: Viet-Hoa Do --- test/common/buffer.cpp | 4 +- test/common/buffer.hpp | 7 +- test/common/int4.cpp | 5 +- test/common/int4.hpp | 4 +- test/reference/binary_elementwise.cpp | 25 ++++--- test/reference/binary_elementwise.hpp | 15 ++-- test/reference/cast.cpp | 17 +++-- test/reference/cast.hpp | 9 ++- test/reference/clamp.cpp | 13 ++-- test/reference/clamp.hpp | 5 +- test/reference/fill.cpp | 24 +++--- test/reference/fill.hpp | 8 +- test/reference/matmul.cpp | 74 +++++++++---------- test/reference/matmul.hpp | 15 ++-- test/reference/matmul_pack.cpp | 12 ++- test/reference/matmul_pack.hpp | 6 +- test/reference/pack.cpp | 42 +++++------ test/reference/pack.hpp | 15 ++-- test/reference/pad.cpp | 44 +++++++++-- test/reference/pad.hpp | 22 +++++- test/reference/quantize.cpp | 69 +++++++++-------- test/reference/quantize.hpp | 17 +++-- test/reference/reduce.cpp | 33 ++++----- test/reference/reduce.hpp | 6 +- test/reference/reorder.cpp | 10 +-- test/reference/reorder.hpp | 7 +- test/reference/transpose.cpp | 24 +++--- test/reference/transpose.hpp | 9 ++- test/tests/buffer_test.cpp | 6 +- test/tests/imatmul_test.cpp | 4 +- .../matmul_clamp_f16_bf16p_bf16p_test.cpp | 33 ++++----- .../matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp | 11 +-- .../matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp | 9 ++- ...atmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp | 15 ++-- .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 39 +++++----- test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 4 +- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 19 +++-- 
.../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 41 +++++----- .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp | 13 ++-- ...atmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp | 15 ++-- ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 10 ++- .../matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 18 +++-- test/tests/matmul_test.cpp | 52 +++++++------ 43 files changed, 456 insertions(+), 374 deletions(-) diff --git a/test/common/buffer.cpp b/test/common/buffer.cpp index 65c9e261..3945eba2 100644 --- a/test/common/buffer.cpp +++ b/test/common/buffer.cpp @@ -21,7 +21,7 @@ namespace kai::test { -Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { +Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) { KAI_ASSUME_MSG(size > 0, "Buffers must be of non-zero size"); const char* val = getenv("KAI_TEST_BUFFER_POLICY"); @@ -57,6 +57,8 @@ Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { default: allocate(); } + + memset(data(), init_value, size); } void Buffer::allocate() { diff --git a/test/common/buffer.hpp b/test/common/buffer.hpp index a2226f53..4cb06e4b 100644 --- a/test/common/buffer.hpp +++ b/test/common/buffer.hpp @@ -29,7 +29,8 @@ class Buffer { using handle = std::unique_ptr>; public: - explicit Buffer(size_t size); + Buffer() = default; + Buffer(size_t size, uint8_t init_value = 0); Buffer(const Buffer& other) = delete; Buffer(Buffer&& other) noexcept = default; @@ -41,7 +42,7 @@ public: /// Gets the base memory address of the user buffer. /// /// @return Base memory address of the user buffer. 
- [[nodiscard]] void* data() const { + [[nodiscard]] std::byte* data() const { return static_cast(m_buffer.get()) + m_user_buffer_offset; } @@ -81,7 +82,7 @@ private: handle m_buffer = nullptr; - size_t m_user_buffer_size; + size_t m_user_buffer_size = 0; size_t m_user_buffer_offset = 0; BufferProtectionPolicy m_protection_policy = BufferProtectionPolicy::None; diff --git a/test/common/int4.cpp b/test/common/int4.cpp index ff64de20..f0e1a56d 100644 --- a/test/common/int4.cpp +++ b/test/common/int4.cpp @@ -11,6 +11,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/memory.hpp" namespace kai::test { @@ -115,9 +116,9 @@ std::tuple Int4::unpack_u8(uint8_t value) { // ===================================================================================================================== -std::vector convert_s0s1_s1s0(const std::vector& src) { +Buffer convert_s0s1_s1s0(const Buffer& src) { const auto length = src.size(); - std::vector dst(length); + Buffer dst(length); for (size_t i = 0; i < length; ++i) { uint8_t val = read_array(src.data(), i); diff --git a/test/common/int4.hpp b/test/common/int4.hpp index 1d9ba8b5..aa05d9bd 100644 --- a/test/common/int4.hpp +++ b/test/common/int4.hpp @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// 4-bit unsigned integer. @@ -121,6 +123,6 @@ private: /// @param[in] src The data buffer. /// /// @return The buffer with packed byte, where the high and low nibbles reversed. 
-std::vector convert_s0s1_s1s0(const std::vector& src); +Buffer convert_s0s1_s1s0(const Buffer& src); } // namespace kai::test diff --git a/test/reference/binary_elementwise.cpp b/test/reference/binary_elementwise.cpp index 803d87fb..48434e47 100644 --- a/test/reference/binary_elementwise.cpp +++ b/test/reference/binary_elementwise.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" #include "test/common/int4.hpp" @@ -67,13 +68,13 @@ T scalar_binary_elementwise(T lhs, T rhs) { /// /// @return The result data buffer. template -std::vector binary_elementwise_any_op_type( +Buffer binary_elementwise_any_op_type( const void* lhs, const void* rhs, size_t lhs_height, size_t lhs_width, size_t rhs_height, size_t rhs_width) { const auto height = std::max(lhs_height, rhs_height); const auto width = std::max(lhs_width, rhs_width); KAI_ASSUME(width * size_in_bits % 8 == 0); - std::vector dst(height * width * size_in_bits / 8); + Buffer dst(height * width * size_in_bits / 8); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ -94,7 +95,7 @@ std::vector binary_elementwise_any_op_type( } template -std::vector binary_elementwise_any_type( +Buffer binary_elementwise_any_type( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { KAI_ASSUME(lhs_dt == rhs_dt); @@ -121,14 +122,14 @@ std::vector binary_elementwise_any_type( } // namespace -std::vector add( +Buffer add( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( lhs, lhs_dt, lhs_height, lhs_width, rhs, rhs_dt, rhs_height, rhs_width); } -std::vector sub( +Buffer sub( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType 
rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( @@ -136,18 +137,18 @@ std::vector sub( } template -std::vector sub( +Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_op_type( lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); } -template std::vector sub( +template Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -std::vector mul( +Buffer mul( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( @@ -155,22 +156,22 @@ std::vector mul( } template -std::vector mul( +Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_op_type( lhs, rhs, lhs_height, lhs_width, rhs_height, rhs_width); } -template std::vector mul( +template Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -template std::vector mul( +template Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); -std::vector div( +Buffer div( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width) { return binary_elementwise_any_type( diff --git a/test/reference/binary_elementwise.hpp b/test/reference/binary_elementwise.hpp index f2f5c0ab..713f8692 100644 --- a/test/reference/binary_elementwise.hpp +++ b/test/reference/binary_elementwise.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: 
Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,7 +29,7 @@ namespace kai::test { /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector add( +Buffer add( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -46,7 +47,7 @@ std::vector add( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector sub( +Buffer sub( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -65,7 +66,7 @@ std::vector sub( /// /// @return The result matrix. template -std::vector sub( +Buffer sub( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); @@ -83,7 +84,7 @@ std::vector sub( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. -std::vector mul( +Buffer mul( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); @@ -102,7 +103,7 @@ std::vector mul( /// /// @return The result matrix. template -std::vector mul( +Buffer mul( const void* lhs, size_t lhs_height, size_t lhs_width, // const void* rhs, size_t rhs_height, size_t rhs_width); @@ -120,7 +121,7 @@ std::vector mul( /// @param[in] rhs_width RHS width. /// /// @return The result matrix. 
-std::vector div( +Buffer div( const void* lhs, DataType lhs_dt, size_t lhs_height, size_t lhs_width, // const void* rhs, DataType rhs_dt, size_t rhs_height, size_t rhs_width); diff --git a/test/reference/cast.cpp b/test/reference/cast.cpp index 95a63a32..e11cb350 100644 --- a/test/reference/cast.cpp +++ b/test/reference/cast.cpp @@ -12,6 +12,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" #include "test/common/memory.hpp" @@ -20,8 +21,8 @@ namespace kai::test { template -std::vector cast(const void* src, size_t length) { - std::vector dst(round_up_division(length * size_in_bits, 8)); +Buffer cast(const void* src, size_t length) { + Buffer dst(round_up_division(length * size_in_bits, 8)); for (size_t i = 0; i < length; ++i) { write_array(dst.data(), i, static_cast(read_array(src, i))); @@ -30,11 +31,11 @@ std::vector cast(const void* src, size_t length) { return dst; } -template std::vector cast(const void* src, size_t length); -template std::vector cast(const void* src, size_t length); -template std::vector cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); +template Buffer cast(const void* src, size_t length); -std::vector cast(const void* src, kai::test::DataType src_dt, DataType dst_dt, size_t height, size_t width) { +Buffer cast(const void* src, kai::test::DataType src_dt, DataType dst_dt, size_t height, size_t width) { const auto length = height * width; if (src_dt == DataType::BF16 && dst_dt == DataType::FP32) { @@ -46,8 +47,8 @@ std::vector cast(const void* src, kai::test::DataType src_dt, DataType KAI_ERROR("Unsupported cast data type!"); } -std::vector cast_qsu4_qsi4(const void* src, size_t length) { - std::vector dst(round_up_division(length, 2)); +Buffer cast_qsu4_qsi4(const void* src, size_t length) { + Buffer 
dst(round_up_division(length, 2)); for (size_t i = 0; i < length; ++i) { write_array(dst.data(), i, static_cast(static_cast(read_array(src, i)) + 8)); diff --git a/test/reference/cast.hpp b/test/reference/cast.hpp index 744d5a3c..8dc09b22 100644 --- a/test/reference/cast.hpp +++ b/test/reference/cast.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -24,7 +25,7 @@ namespace kai::test { /// /// @return A new data buffer containing casted values. template -std::vector cast(const void* src, size_t length); +Buffer cast(const void* src, size_t length); /// Converts each element of the source matrix to the new data type. /// @@ -35,7 +36,7 @@ std::vector cast(const void* src, size_t length); /// @param[in] width Number of columns. /// /// @return The result matrix containing data in the destination data type. -std::vector cast(const void* src, DataType src_dt, DataType dst_dt, size_t height, size_t width); +Buffer cast(const void* src, DataType src_dt, DataType dst_dt, size_t height, size_t width); /// Converts each element of the source data from 4-bit signed symmetric quantized /// to 4-bit unsigned symmetric quantized. @@ -44,6 +45,6 @@ std::vector cast(const void* src, DataType src_dt, DataType dst_dt, siz /// @param[in] length The number of elements. /// /// @return A new data buffer with converted values. 
-std::vector cast_qsu4_qsi4(const void* src, size_t length); +Buffer cast_qsu4_qsi4(const void* src, size_t length); } // namespace kai::test diff --git a/test/reference/clamp.cpp b/test/reference/clamp.cpp index ab2e77e9..eadc755e 100644 --- a/test/reference/clamp.cpp +++ b/test/reference/clamp.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/float16.hpp" #include "test/common/memory.hpp" #include "test/common/numeric_limits.hpp" @@ -64,8 +65,8 @@ std::tuple find_clamp_range(DataType type, const void* src, size_t } template -std::vector clamp(const void* src, size_t len, T min_value, T max_value) { - std::vector dst(round_up_division(len * size_in_bits, 8)); +Buffer clamp(const void* src, size_t len, T min_value, T max_value) { + Buffer dst(round_up_division(len * size_in_bits, 8)); for (size_t i = 0; i < len; ++i) { write_array(dst.data(), i, std::clamp(read_array(src, i), min_value, max_value)); @@ -74,11 +75,11 @@ std::vector clamp(const void* src, size_t len, T min_value, T max_value return dst; } -template std::vector clamp(const void* src, size_t len, float min_value, float max_value); -template std::vector clamp(const void* src, size_t len, Float16 min_value, Float16 max_value); +template Buffer clamp(const void* src, size_t len, float min_value, float max_value); +template Buffer clamp(const void* src, size_t len, Float16 min_value, Float16 max_value); -std::vector clamp(DataType type, const void* src, size_t len, float min_value, float max_value) { - std::vector dst(round_up_division(len * data_type_size_in_bits(type), 8)); +Buffer clamp(DataType type, const void* src, size_t len, float min_value, float max_value) { + Buffer dst(round_up_division(len * data_type_size_in_bits(type), 8)); for (size_t i = 0; i < len; ++i) { write_array(type, dst.data(), i, std::clamp(read_array(type, src, i), min_value, max_value)); diff --git a/test/reference/clamp.hpp b/test/reference/clamp.hpp index 
532e7d25..52ca57ba 100644 --- a/test/reference/clamp.hpp +++ b/test/reference/clamp.hpp @@ -11,6 +11,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -42,7 +43,7 @@ std::tuple find_clamp_range(DataType type, const void* src, size_t /// @param[in] min_value Lower bound of clamp. /// @param[in] width Upper bound of clamp. template -std::vector clamp(const void* src, size_t len, T min_value, T max_value); +Buffer clamp(const void* src, size_t len, T min_value, T max_value); /// Clamps the matrix. /// @@ -51,5 +52,5 @@ std::vector clamp(const void* src, size_t len, T min_value, T max_value /// @param[in] len Number of values in the source matrix. /// @param[in] min_value Lower bound of clamp. /// @param[in] max_value Upper bound of clamp. -std::vector clamp(DataType type, const void* src, size_t len, float min_value, float max_value); +Buffer clamp(DataType type, const void* src, size_t len, float min_value, float max_value); } // namespace kai::test diff --git a/test/reference/fill.cpp b/test/reference/fill.cpp index 459055a1..82068fdb 100644 --- a/test/reference/fill.cpp +++ b/test/reference/fill.cpp @@ -15,6 +15,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -26,12 +27,11 @@ namespace kai::test { namespace { template -std::vector fill_matrix_raw(size_t height, size_t width, std::function gen) { +Buffer fill_matrix_raw(size_t height, size_t width, std::function gen) { const auto size = height * width * size_in_bits / 8; KAI_ASSUME(width * size_in_bits % 8 == 0); - std::vector data; - data.resize(size); + Buffer data(size); auto ptr = reinterpret_cast(data.data()); for (size_t y = 0; y < height; ++y) { @@ -44,7 +44,7 @@ std::vector fill_matrix_raw(size_t height, size_t width, std::function< } template -std::vector 
fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { using TDist = std::conditional_t< std::is_floating_point_v, std::uniform_real_distribution, std::uniform_int_distribution>; @@ -55,7 +55,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_ } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_real_distribution dist; @@ -63,7 +63,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_real_distribution dist; @@ -71,7 +71,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t widt } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_int_distribution dist(-8, 7); @@ -79,7 +79,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, u } template <> -std::vector fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { +Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { std::mt19937 rnd(seed); std::uniform_int_distribution dist(0, 15); @@ -88,7 +88,7 @@ std::vector fill_matrix_random_raw(size_t height, size_t width, } // namespace -std::vector fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed) { +Buffer fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed) { switch (format.pack_format()) { case DataFormat::PackFormat::NONE: switch (format.data_type()) { @@ -119,11 +119,11 @@ std::vector 
fill_matrix_random(size_t height, size_t width, const DataF } template -std::vector fill_random(size_t length, uint32_t seed) { +Buffer fill_random(size_t length, uint32_t seed) { return fill_matrix_random_raw(1, length, seed); } -template std::vector fill_random(size_t length, uint32_t seed); -template std::vector fill_random(size_t length, uint32_t seed); +template Buffer fill_random(size_t length, uint32_t seed); +template Buffer fill_random(size_t length, uint32_t seed); } // namespace kai::test diff --git a/test/reference/fill.hpp b/test/reference/fill.hpp index 25846c6c..9dd0f26c 100644 --- a/test/reference/fill.hpp +++ b/test/reference/fill.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -22,7 +24,7 @@ class DataFormat; /// @param[in] seed Random seed. /// /// @return The data buffer for the matrix. -std::vector fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed); +Buffer fill_matrix_random(size_t height, size_t width, const DataFormat& format, uint32_t seed); /// Creates a new data buffer filled with random data. /// @@ -33,6 +35,6 @@ std::vector fill_matrix_random(size_t height, size_t width, const DataF /// /// @return The data buffer. 
template -std::vector fill_random(size_t length, uint32_t seed); +Buffer fill_random(size_t length, uint32_t seed); } // namespace kai::test diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index b1378c75..be5bb6a6 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -42,7 +43,7 @@ namespace { /// /// @return The result data buffer. template -std::vector matmul_any_type( +Buffer matmul_any_type( const void* lhs, const void* rhs, // size_t m, size_t n, size_t k, // bool lhs_transposed, bool rhs_transposed) { @@ -52,8 +53,7 @@ std::vector matmul_any_type( const auto rhs_n_stride = rhs_transposed ? k : 1; const auto rhs_k_stride = rhs_transposed ? 1 : n; - std::vector dst; - dst.resize(m * n * size_in_bits / 8); + Buffer dst(m * n * size_in_bits / 8); KAI_ASSUME(n * size_in_bits % 8 == 0); for (size_t im = 0; im < m; ++im) { @@ -75,7 +75,7 @@ std::vector matmul_any_type( } // namespace -std::vector matmul_pack_rhs( +Buffer matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, const DataFormat& dst_format, size_t n, size_t k, bool transposing) { const auto src_dt = src_format.data_type(); @@ -84,9 +84,9 @@ std::vector matmul_pack_rhs( const auto dst_dt = dst_format.data_type(); const auto dst_pf = dst_format.pack_format(); - std::vector tmp_data; - std::vector tmp_scales; - std::vector tmp_zero_points; + Buffer tmp_data; + Buffer tmp_scales; + Buffer tmp_zero_points; if (transposing) { tmp_data = transpose(data, src_dt, k, n); @@ -125,7 +125,7 @@ std::vector matmul_pack_rhs( return pack(dst_format, data, scales, zero_points, src_format, n, k); } -std::vector matmul( +Buffer matmul( const void* lhs, [[maybe_unused]] const void* lhs_scales, [[maybe_unused]] const void* 
lhs_zero_points, DataType lhs_dt, // const void* rhs, [[maybe_unused]] const void* rhs_scales, [[maybe_unused]] const void* rhs_zero_points, @@ -140,10 +140,10 @@ std::vector matmul( const auto rhs_h = rhs_transposed ? n : k; const auto rhs_w = rhs_transposed ? k : n; - std::vector tmp_lhs; - std::vector tmp_rhs; - std::vector tmp_dst; - std::vector tmp_bias; + Buffer tmp_lhs; + Buffer tmp_rhs; + Buffer tmp_dst; + Buffer tmp_bias; if (lhs_dt != dst_dt) { tmp_lhs = cast(lhs, lhs_dt, dst_dt, lhs_h, lhs_w); @@ -184,7 +184,7 @@ std::vector matmul( return tmp_dst; } -std::vector indirect_matmul( +Buffer indirect_matmul( const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // @@ -196,7 +196,7 @@ std::vector indirect_matmul( // This is inefficient, but allows code-reuse const size_t chunk_bytes = k_chunk_length * round_up_division(data_type_size_in_bits(lhs_dt), 8); const size_t n_chunks = m * k_chunk_count; - std::vector lhs(n_chunks * chunk_bytes); + Buffer lhs(n_chunks * chunk_bytes); // Copy all chunks to the created matrix for (size_t i = 0; i < n_chunks; i += 1) { @@ -217,7 +217,7 @@ std::vector indirect_matmul( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector indirect_matmul_nt_t_quantized( +Buffer indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, @@ -228,7 +228,7 @@ std::vector indirect_matmul_nt_t_quantized( const auto lhs_num_quant_per_row = round_up_division(k_chunk_count * k_chunk_length, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k_chunk_count * 
k_chunk_length, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); for (size_t i_m = 0; i_m < m; ++i_m) { for (size_t i_n = 0; i_n < n; ++i_n) { @@ -293,7 +293,7 @@ std::vector indirect_matmul_nt_t_quantized( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector matmul_nt_t_quantized( +Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, // size_t lhs_quant_height, size_t lhs_quant_width, // @@ -304,7 +304,7 @@ std::vector matmul_nt_t_quantized( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); for (size_t row = 0; row < m; ++row) { for (size_t col = 0; col < n; ++col) { @@ -355,8 +355,7 @@ std::vector matmul_nt_t_quantized( return dst; } -template std::vector -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -364,8 +363,7 @@ matmul_nt_t_quantized -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -373,8 +371,7 @@ matmul_nt_t_quantized -matmul_nt_t_quantized( +template Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -382,7 +379,7 @@ matmul_nt_t_quantized +template Buffer 
indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding, const void* lhs_scales, @@ -395,7 +392,7 @@ indirect_matmul_nt_t_quantized -std::vector matmul_clamp_nt_t( +Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -404,7 +401,7 @@ std::vector matmul_clamp_nt_t( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); const auto* lhs_scales_ptr = reinterpret_cast(lhs_scales); const auto* rhs_scales_ptr = reinterpret_cast(rhs_scales); @@ -448,29 +445,28 @@ std::vector matmul_clamp_nt_t( return dst; } -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector -matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const 
void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_t( +template Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -480,7 +476,7 @@ template std::vector matmul_clamp_nt_t -std::vector matmul_clamp_nt_nt( +Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -489,7 +485,7 @@ std::vector matmul_clamp_nt_nt( const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); - std::vector dst(m * n * sizeof(DstData)); + Buffer dst(m * n * sizeof(DstData)); const auto* lhs_scales_ptr = reinterpret_cast(lhs_scales); const auto* rhs_scales_ptr = reinterpret_cast(rhs_scales); @@ -533,29 +529,27 @@ std::vector matmul_clamp_nt_nt( return dst; } -template std::vector matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float 
min_value, float max_value); -template std::vector -matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // const void* biases, // float min_value, float max_value); -template std::vector -matmul_clamp_nt_nt( +template Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index 343b8d34..d349d0b4 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,7 +29,7 @@ class DataFormat; /// @param[in] transposing Perform transpose then pack. /// /// @return The packed RHS matrix. -std::vector matmul_pack_rhs( +Buffer matmul_pack_rhs( const void* data, const void* scales, const void* zero_points, const DataFormat& src_format, const DataFormat& dst_format, size_t n, size_t k, bool transposing); @@ -57,7 +58,7 @@ std::vector matmul_pack_rhs( /// @param[in] rhs_transposed `true` if RHS operand is transposed. /// /// @return The result data buffer. -std::vector matmul( +Buffer matmul( const void* lhs, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // const void* bias, const void* bias_scales, const void* bias_zero_points, DataType bias_dt, // @@ -88,7 +89,7 @@ std::vector matmul( /// @param[in] k_chunk_size Number of elements in each LHS K chunk /// /// @return The result data buffer. 
-std::vector indirect_matmul( +Buffer indirect_matmul( const void* const* lhs_idata, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, DataType lhs_dt, // const void* rhs, const void* rhs_scales, const void* rhs_zero_points, DataType rhs_dt, // @@ -129,7 +130,7 @@ std::vector indirect_matmul( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename Bias, typename IntAcc, typename DstData> -std::vector matmul_clamp_nt_t( +Buffer matmul_clamp_nt_t( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -174,7 +175,7 @@ std::vector matmul_clamp_nt_t( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename Bias, typename IntAcc, typename DstData> -std::vector matmul_clamp_nt_nt( +Buffer matmul_clamp_nt_nt( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_width, // const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_width, // @@ -184,7 +185,7 @@ std::vector matmul_clamp_nt_nt( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector matmul_nt_t_quantized( +Buffer matmul_nt_t_quantized( size_t m, size_t n, size_t k, // const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, size_t lhs_quant_width, // @@ -195,7 +196,7 @@ std::vector matmul_nt_t_quantized( template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename 
RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> -std::vector indirect_matmul_nt_t_quantized( +Buffer indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // const void* const* lhs_ptrs, uintptr_t lhs_offset, const void* lhs_padding_ptr, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, diff --git a/test/reference/matmul_pack.cpp b/test/reference/matmul_pack.cpp index 55e916a0..40139e09 100644 --- a/test/reference/matmul_pack.cpp +++ b/test/reference/matmul_pack.cpp @@ -10,16 +10,18 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/round.hpp" #include "test/reference/binary_elementwise.hpp" #include "test/reference/pack.hpp" +#include "test/reference/pad.hpp" #include "test/reference/reduce.hpp" #include "test/reference/reorder.hpp" namespace kai::test { template -std::vector matmul_pack_rhs_nxk_static_quantized( +Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, Scale lhs_scale, Scale dst_scale, const void* biases, ZeroPoint lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width) { // The RHS data matrix is reordered according to the blocking parameters. @@ -29,7 +31,8 @@ std::vector matmul_pack_rhs_nxk_static_quantized( // final_scales[n_index] = lhs_scale * rhs_scales[n_index] / dst_scale. const auto scale_multiplier = lhs_scale / dst_scale; auto combined_scales = mul(scales, 1, n, &scale_multiplier, 1, 1); - combined_scales.resize(round_up_multiple(n, block_height) * sizeof(Scale)); // Pads with 0s. + combined_scales = pad_matrix( + combined_scales.data(), 1, n, 0, 0, round_up_multiple(n, block_height) - n, 0, 0); // Pads with 0s. // The effective per-channel biases: // final_biases[n_index] = biases[n_index] - lhs_zero_point * sum(data[n_index, :]). 
@@ -37,7 +40,8 @@ std::vector matmul_pack_rhs_nxk_static_quantized( // Reduced across width earlier, so lhs width is now 1 const auto row_sum_times_lhs_zp = mul(row_sum_reduced.data(), n, 1, &lhs_zero_point, 1, 1); auto combined_biases = sub(biases, 1, n, row_sum_times_lhs_zp.data(), 1, n); - combined_biases.resize(round_up_multiple(n, block_height) * sizeof(ZeroPoint)); // Pads with 0s. + combined_biases = pad_matrix( + combined_biases.data(), 1, n, 0, 0, round_up_multiple(n, block_height) - n, 0, 0); // Pads with 0s. // Packs the effective biases followed by the data block followed by the effective scales for the block. auto packed_rhs = pack_zero_points_data_scales_per_block( @@ -47,7 +51,7 @@ std::vector matmul_pack_rhs_nxk_static_quantized( return packed_rhs; } -template std::vector matmul_pack_rhs_nxk_static_quantized( +template Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, float lhs_scale, float dst_scale, const void* biases, int32_t lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width); diff --git a/test/reference/matmul_pack.hpp b/test/reference/matmul_pack.hpp index 30646c98..ea713dd7 100644 --- a/test/reference/matmul_pack.hpp +++ b/test/reference/matmul_pack.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Packs the RHS buffer for static quantized GeMM. @@ -37,7 +39,7 @@ namespace kai::test { /// /// @return The packed RHS. 
template -std::vector matmul_pack_rhs_nxk_static_quantized( +Buffer matmul_pack_rhs_nxk_static_quantized( const void* data, const void* scales, Scale lhs_scale, Scale dst_scale, const void* biases, ZeroPoint lhs_zero_point, size_t n, size_t k, size_t block_height, size_t block_width); diff --git a/test/reference/pack.cpp b/test/reference/pack.cpp index 055026f7..e06e3303 100644 --- a/test/reference/pack.cpp +++ b/test/reference/pack.cpp @@ -16,6 +16,7 @@ #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/float16.hpp" @@ -39,13 +40,13 @@ BFloat16 convert(const uint8_t* src_ptr_elm, DataType src_dtype, DataType dst_dt } } -std::vector pack_block( +Buffer pack_block( const void* src, DataType src_dtype, DataType dst_dtype, size_t src_esize, size_t dst_esize, size_t full_height, size_t full_width, size_t block_height, size_t block_width, size_t subblock_height, size_t subblock_width) { const auto dst_bytes = round_up_multiple(full_height, block_height) * round_up_multiple(full_width, block_width) * dst_esize; - std::vector dst(dst_bytes, 0); + Buffer dst(dst_bytes, 0); const auto* src_ptr = reinterpret_cast(src); auto* dst_ptr = dst.data(); @@ -97,7 +98,7 @@ std::vector pack_block( } /// Packs the matrix from raw to per-row bias format. 
-std::vector pack_bias_per_row( +Buffer pack_bias_per_row( DataType src_dtype, DataType bias_dtype, DataType dst_dtype, size_t src_esize, size_t bias_esize, size_t dst_esize, const void* src, const void* bias, size_t height, size_t width, size_t block_height, size_t block_width, size_t subblock_height, size_t subblock_width) { @@ -110,7 +111,7 @@ std::vector pack_bias_per_row( const auto group_bytes = group_bias_bytes + group_num_blocks * block_data_bytes; const auto dst_bytes = num_groups * group_bytes; - std::vector dst(dst_bytes, 0); + Buffer dst(dst_bytes, 0); const auto* src_ptr = reinterpret_cast(src); const auto* bias_ptr = reinterpret_cast(bias); @@ -170,7 +171,7 @@ std::vector pack_bias_per_row( } // namespace -std::vector pack( +Buffer pack( const DataFormat& dst_format, const void* src, [[maybe_unused]] const void* scales, const void* bias, const DataFormat& src_format, size_t height, size_t width) { const auto dst_dt = dst_format.data_type(); @@ -219,8 +220,7 @@ std::vector pack( } template -std::vector pack_data_scales( - const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { +Buffer pack_data_scales(const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { KAI_ASSUME_IF(size_in_bits < 8, quant_width % (8 / size_in_bits) == 0); KAI_ASSUME_IF(size_in_bits < 8, width % (8 / size_in_bits) == 0); @@ -229,7 +229,7 @@ std::vector pack_data_scales( const auto data_bytes = height * width * size_in_bits / 8; const auto scales_bytes = height * num_quant_packets_x * sizeof(Scale); - std::vector dst(data_bytes + scales_bytes); + Buffer dst(data_bytes + scales_bytes); const auto* scales_ptr = reinterpret_cast(scales); auto* dst_ptr = dst.data(); @@ -251,13 +251,13 @@ std::vector pack_data_scales( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } template -std::vector pack_zero_points_data_scales_per_block( +Buffer 
pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales) { // Only data is allowed to be sub-byte. @@ -272,7 +272,7 @@ std::vector pack_zero_points_data_scales_per_block( KAI_ASSUME( (block_num_data * size_in_bits + block_num_scales * size_in_bits) % size_in_bits == 0); - std::vector dst(round_up_division( + Buffer dst(round_up_division( num_blocks * (block_num_zero_points * size_in_bits + block_num_data * size_in_bits + block_num_scales * size_in_bits), @@ -297,17 +297,17 @@ std::vector pack_zero_points_data_scales_per_block( dst_ptr += block_num_scales * sizeof(Scale); } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } -template std::vector pack_zero_points_data_scales_per_block( +template Buffer pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales); template -std::vector pack_data_scales_interleave_block( +Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width) { KAI_ASSUME_IF(size_in_bits < 8, quant_width % (8 / size_in_bits) == 0); KAI_ASSUME_IF(size_in_bits < 8, width % (8 / size_in_bits) == 0); @@ -319,7 +319,7 @@ std::vector pack_data_scales_interleave_block( const auto data_bytes = height * width * size_in_bits / 8; const auto scales_bytes = scales != nullptr ? 
height * num_quant_packets_x * sizeof(Scale) : 0; - std::vector dst(data_bytes + scales_bytes); + Buffer dst(data_bytes + scales_bytes); const auto* scales_ptr = reinterpret_cast(scales); auto* dst_ptr = dst.data(); @@ -341,18 +341,18 @@ std::vector pack_data_scales_interleave_block( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } -template std::vector pack_data_scales_interleave_block( +template Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); -template std::vector pack_data_scales_interleave_block( +template Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); template -std::vector pack_block_data_zero_points_scale_bias( +Buffer pack_block_data_zero_points_scale_bias( const void* data, const void* zero_points, const void* scales, const void* biases, size_t height, size_t width, size_t quant_height, size_t quant_width, size_t block_height, size_t block_width, size_t interleave_x_blocks) { if (quant_width == width) { @@ -382,7 +382,7 @@ std::vector pack_block_data_zero_points_scale_bias( const auto biases_bytes = has_biases ? 
height * sizeof(Bias) : 0; const auto dst_bytes = num_quant_packets_y * num_quant_packets_x * quant_packet_bytes + biases_bytes; - std::vector dst(dst_bytes); + Buffer dst(dst_bytes); const auto* zero_points_ptr = reinterpret_cast(zero_points); const auto* scales_ptr = reinterpret_cast(scales); @@ -445,7 +445,7 @@ std::vector pack_block_data_zero_points_scale_bias( } } - KAI_ASSERT(dst_ptr == &*dst.end()); + KAI_ASSERT(dst_ptr == dst.data() + dst.size()); return dst; } diff --git a/test/reference/pack.hpp b/test/reference/pack.hpp index 63c94d58..b424d5ba 100644 --- a/test/reference/pack.hpp +++ b/test/reference/pack.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -21,7 +23,7 @@ class DataFormat; /// @param[in] src_format Data format of the source matrix. /// @param[in] height Number of rows of the source matrix. /// @param[in] width Number of columns of the source matrix. -std::vector pack( +Buffer pack( const DataFormat& dst_format, const void* src, const void* scales, const void* bias, const DataFormat& src_format, size_t height, size_t width); @@ -76,8 +78,7 @@ std::vector pack( /// /// @return The packed data buffer. template -std::vector pack_data_scales( - const void* data, const void* scales, size_t height, size_t width, size_t quant_width); +Buffer pack_data_scales(const void* data, const void* scales, size_t height, size_t width, size_t quant_width); /// Packs the zero point, data and scale into a single buffer. /// @@ -139,7 +140,7 @@ std::vector pack_data_scales( /// /// @return The packed data buffer. 
template -std::vector pack_zero_points_data_scales_per_block( +Buffer pack_zero_points_data_scales_per_block( const void* zero_points, const void* data, const void* scales, size_t num_blocks, size_t block_num_zero_points, size_t block_num_data, size_t block_num_scales); @@ -197,7 +198,7 @@ std::vector pack_zero_points_data_scales_per_block( /// /// @return The packed data buffer. template -std::vector pack_data_scales_interleave_block( +Buffer pack_data_scales_interleave_block( const void* data, const void* scales, size_t height, size_t width, size_t quant_width); /// Packs the quantized data with two halves of a block interleaved. @@ -235,7 +236,7 @@ std::vector pack_data_scales_interleave_block( /// /// @return The packed data buffer. template -std::vector pack_data_interleave_block(const void* data, size_t height, size_t width, size_t block_width) { +Buffer pack_data_interleave_block(const void* data, size_t height, size_t width, size_t block_width) { return pack_data_scales_interleave_block(data, nullptr, height, width, block_width); } diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index 5b6f2b8d..182f6dc8 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,16 +12,18 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/memory.hpp" +#include "test/common/round.hpp" namespace kai::test { template -std::vector pad_row( +Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val) { - std::vector output(dst_size, val); + Buffer output(dst_size, val); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ 
-31,11 +33,43 @@ std::vector pad_row( } return output; } -template std::vector pad_row( +template Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val); -template std::vector pad_row( +template Buffer pad_row( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size, const uint8_t val); + +template +Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + T pad_value) { + const size_t dst_height = height + pad_top + pad_bottom; + const size_t dst_width = width + pad_left + pad_right; + const size_t dst_size = round_up_multiple(dst_height * dst_width * size_in_bits, 8); + + Buffer dst(dst_size); + + for (size_t y = 0; y < dst_height; ++y) { + for (size_t x = 0; x < dst_width; ++x) { + if (y >= pad_top && y < pad_top + height && x >= pad_left && x < pad_left + width) { + const T value = read_array(data, (y - pad_top) * width + x - pad_left); + write_array(dst.data(), y * dst_width + x, value); + } else { + write_array(dst.data(), y * dst_width + x, pad_value); + } + } + } + + return dst; +} + +template Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + float pad_value); +template Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + int32_t pad_value); + } // namespace kai::test diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index 2f46639d..74c0229a 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // 
SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -28,8 +29,25 @@ namespace kai::test { /// @return The padded matrix. /// template -std::vector pad_row( +Buffer pad_row( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size, uint8_t val = 0); +/// Pads the matrix with value. +/// +/// @param[in] data The input data buffer. +/// @param[in] height The number of input rows. +/// @param[in] width The number of input columns. +/// @param[in] pad_left The number of element padded to the left. +/// @param[in] pad_top The number of element padded to the top. +/// @param[in] pad_right The number of element padded to the right. +/// @param[in] pad_bottom The number of element padded to the bottom. +/// @param[in] pad_value The padding value. +/// +/// @return The padded matrix. +template +Buffer pad_matrix( + const void* data, size_t height, size_t width, size_t pad_left, size_t pad_top, size_t pad_right, size_t pad_bottom, + T pad_value); + } // namespace kai::test diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index 477a13c2..c64d8404 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -14,6 +14,7 @@ #include #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/int4.hpp" #include "test/common/memory.hpp" #include "test/common/numeric_limits.hpp" @@ -77,8 +78,7 @@ IntType quantize_asymmetric(FloatType value, FloatType scale, ZeroPointType zero template int8_t quantize_asymmetric(float value, float scale, int32_t zero_point); template -std::vector compute_symmetric_per_block_quantization_info( - const void* src, size_t height, size_t width, size_t quant_width) { +Buffer compute_symmetric_per_block_quantization_info(const void* src, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); 
static_assert(is_integral); static_assert(is_floating_point); @@ -88,7 +88,7 @@ std::vector compute_symmetric_per_block_quantization_info( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto scales_bytes = height * num_quant_packets_x * sizeof(ScaleType); - std::vector scales(scales_bytes); + Buffer scales(scales_bytes); const auto* src_ptr = reinterpret_cast(src); @@ -117,7 +117,7 @@ std::vector compute_symmetric_per_block_quantization_info( } template -std::vector quantize_symmetric_per_block( +Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); static_assert(is_integral); @@ -126,7 +126,7 @@ std::vector quantize_symmetric_per_block( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto data_bytes = round_up_division(height * width * size_in_bits, 8); - std::vector data(data_bytes); + Buffer data(data_bytes); const auto* src_ptr = reinterpret_cast(src); @@ -148,11 +148,11 @@ std::vector quantize_symmetric_per_block( return data; } -template std::vector quantize_symmetric_per_block( +template Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width); template -std::tuple, std::vector> quantize_symmetric_per_block_dynamic( +std::tuple quantize_symmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width) { auto scales_src_type = compute_symmetric_per_block_quantization_info(src, height, width, quant_width); @@ -160,30 +160,30 @@ std::tuple, std::vector> quantize_symmetric_per_bl src, scales_src_type.data(), height, width, quant_width); if constexpr (std::is_same_v) { - return {data, scales_src_type}; + return {std::move(data), std::move(scales_src_type)}; } else { auto scales = cast(scales_src_type.data(), scales_src_type.size() * 8 / size_in_bits); - return {data, scales}; + return {std::move(data), 
std::move(scales)}; } } -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, Float16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, float>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, Int4, BFloat16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int8_t, Float16>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int8_t, float>(const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector> quantize_symmetric_per_block_dynamic< - float, int32_t, float>(const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); +template std::tuple quantize_symmetric_per_block_dynamic( + const void* src, size_t height, size_t width, size_t quant_width); template -std::tuple, std::vector> compute_asymmetric_per_block_quantization_info( +std::tuple compute_asymmetric_per_block_quantization_info( const void* src, size_t height, size_t width, size_t quant_width) { 
static_assert(is_floating_point); static_assert(is_integral); @@ -195,10 +195,10 @@ std::tuple, std::vector> compute_asymmetric_per_bl const auto num_quant_packets_x = round_up_division(width, quant_width); const auto scales_bytes = height * num_quant_packets_x * sizeof(ScaleType); - std::vector scales(scales_bytes); + Buffer scales(scales_bytes); const auto zero_points_bytes = height * num_quant_packets_x * sizeof(ZeroPointType); - std::vector zero_points(zero_points_bytes); + Buffer zero_points(zero_points_bytes); for (size_t y = 0; y < height; ++y) { for (size_t x_quant = 0; x_quant < width; x_quant += quant_width) { @@ -226,11 +226,11 @@ std::tuple, std::vector> compute_asymmetric_per_bl } } - return {scales, zero_points}; + return {std::move(scales), std::move(zero_points)}; } template -std::vector quantize_asymmetric_per_block( +Buffer quantize_asymmetric_per_block( const void* src, const void* scales, const void* zero_points, size_t height, size_t width, size_t quant_width) { static_assert(is_floating_point); static_assert(is_integral); @@ -240,7 +240,7 @@ std::vector quantize_asymmetric_per_block( const auto num_quant_packets_x = round_up_division(width, quant_width); const auto data_bytes = round_up_division(height * width * size_in_bits, 8); - std::vector data(data_bytes); + Buffer data(data_bytes); for (size_t y = 0; y < height; ++y) { for (size_t x_quant = 0; x_quant < width; x_quant += quant_width) { @@ -267,7 +267,7 @@ std::vector quantize_asymmetric_per_block( } template -std::tuple, std::vector, std::vector> quantize_asymmetric_per_block_dynamic( +std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width) { /* Calculate the asymmetric quantization information, one scaling per row */ auto [scales_src_type, zero_points] = @@ -279,22 +279,19 @@ std::tuple, std::vector, std::vector> qua src, scales_src_type.data(), zero_points.data(), height, width, quant_width); if constexpr (std::is_same_v) { - 
return {data, scales_src_type, zero_points}; + return {std::move(data), std::move(scales_src_type), std::move(zero_points)}; } else { auto scales = cast(scales_src_type.data(), scales_src_type.size() * 8 / size_in_bits); - return {data, scales, zero_points}; + return {std::move(data), std::move(scales), std::move(zero_points)}; } } -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); -template std::tuple, std::vector, std::vector> -quantize_asymmetric_per_block_dynamic( +template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); } // namespace kai::test diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index 3e2f162d..e0c7d1f0 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -11,6 +11,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Quantization method. @@ -90,8 +92,7 @@ IntType quantize_asymmetric(FloatType value, FloatType scale, ZeroPointType zero /// /// @return The quantization scale matrix. template -std::vector compute_symmetric_per_block_quantization_info( - const void* src, size_t height, size_t width, size_t quant_width); +Buffer compute_symmetric_per_block_quantization_info(const void* src, size_t height, size_t width, size_t quant_width); /// Quantizes each block of the matrix using symmetric quantization method. 
/// @@ -158,7 +159,7 @@ std::vector compute_symmetric_per_block_quantization_info( /// /// @return The quantized data matrix. template -std::vector quantize_symmetric_per_block( +Buffer quantize_symmetric_per_block( const void* src, const void* scales, size_t height, size_t width, size_t quant_width); /// Dynamically quantizes each block of the matrix using symmetric quantization method. @@ -184,7 +185,7 @@ std::vector quantize_symmetric_per_block( /// /// @return The quantized data matrix and the quantization scale matrix. template -std::tuple, std::vector> quantize_symmetric_per_block_dynamic( +std::tuple quantize_symmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); /// Computes the quantization information using asymmetric per-block quantization method. @@ -258,7 +259,7 @@ std::tuple, std::vector> quantize_symmetric_per_bl /// /// @return The quantization scale matrix and the quantization zero point matrix. template -std::tuple, std::vector> compute_asymmetric_per_block_quantization_info( +std::tuple compute_asymmetric_per_block_quantization_info( const void* src, size_t height, size_t width, size_t quant_width); /// Quantizes each block of the matrix using asymmetric quantization method. @@ -328,7 +329,7 @@ std::tuple, std::vector> compute_asymmetric_per_bl /// /// @return The quantized data matrix. template -std::vector quantize_asymmetric_per_block( +Buffer quantize_asymmetric_per_block( const void* src, const void* scales, const void* zero_points, size_t height, size_t width, size_t quant_width); /// Dynamically quantizes each block of the matrix using asymmetric quantization method. @@ -355,7 +356,7 @@ std::vector quantize_asymmetric_per_block( /// /// @return The quantized data matrix, the quantization scale matrix and the quantization zero point matrix. 
template -std::tuple, std::vector, std::vector> quantize_asymmetric_per_block_dynamic( +std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); } // namespace kai::test diff --git a/test/reference/reduce.cpp b/test/reference/reduce.cpp index d4935c3f..7045668d 100644 --- a/test/reference/reduce.cpp +++ b/test/reference/reduce.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,6 +12,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_format.hpp" #include "test/common/data_type.hpp" #include "test/common/int4.hpp" @@ -30,12 +31,10 @@ T scalar_reduce(T curr_value, T new_value) { } template -std::vector reduce_any_op_type(const void* src, size_t height, size_t width, size_t dimension) { - std::vector dst; - +Buffer reduce_any_op_type(const void* src, size_t height, size_t width, size_t dimension) { switch (dimension) { - case 0: - dst.resize(height * size_in_bits / 8); + case 0: { + Buffer dst(height * size_in_bits / 8); KAI_ASSUME(height * size_in_bits % 8 == 0); for (size_t y = 0; y < height; ++y) { @@ -49,10 +48,11 @@ std::vector reduce_any_op_type(const void* src, size_t height, size_t w write_array(dst.data(), y, acc); } - break; + return dst; + } - case 1: - dst.resize(width * size_in_bits / 8); + case 1: { + Buffer dst(width * size_in_bits / 8); KAI_ASSUME(width * size_in_bits % 8 == 0); for (size_t x = 0; x < width; ++x) { @@ -66,17 +66,16 @@ std::vector reduce_any_op_type(const void* src, size_t height, size_t w write_array(dst.data(), x, acc); } - break; + return dst; + } default: KAI_ERROR("Only 2D data is supported!"); } - - return dst; } template -std::vector reduce_any_op( +Buffer reduce_any_op( const void* src, const DataFormat& src_format, size_t height, size_t 
width, const DataFormat& dst_format, size_t dimension) { KAI_ASSUME(src_format.is_raw()); @@ -106,15 +105,15 @@ std::vector reduce_any_op( } // namespace -std::vector reduce_add( +Buffer reduce_add( const void* src, const DataFormat& src_format, size_t height, size_t width, const DataFormat& dst_format, size_t dimension) { return reduce_any_op(src, src_format, height, width, dst_format, dimension); } template -std::vector reduce_add_x(const void* src, size_t height, size_t width) { - std::vector dst(round_up_division(height * size_in_bits, 8)); +Buffer reduce_add_x(const void* src, size_t height, size_t width) { + Buffer dst(round_up_division(height * size_in_bits, 8)); for (size_t y = 0; y < height; ++y) { Accumulator acc = 0; @@ -129,7 +128,7 @@ std::vector reduce_add_x(const void* src, size_t height, size_t width) return dst; } -template std::vector reduce_add_x(const void* src, size_t height, size_t width); +template Buffer reduce_add_x(const void* src, size_t height, size_t width); template T reduce_min(const void* src, size_t len) { diff --git a/test/reference/reduce.hpp b/test/reference/reduce.hpp index 449dc57f..8341f442 100644 --- a/test/reference/reduce.hpp +++ b/test/reference/reduce.hpp @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { class DataFormat; @@ -29,7 +31,7 @@ enum class ReductionOperator : uint32_t { /// @param[in] dimension Reduction dimension. /// /// @return The reduced matrix. -std::vector reduce_add( +Buffer reduce_add( const void* src, const DataFormat& src_format, size_t height, size_t width, const DataFormat& dst_format, size_t dimension); @@ -44,7 +46,7 @@ std::vector reduce_add( /// /// @return The vector containing the sum of each input matrix row. template -std::vector reduce_add_x(const void* src, size_t height, size_t width); +Buffer reduce_add_x(const void* src, size_t height, size_t width); /// Retrieve the minimum value in a provided matrix. 
/// diff --git a/test/reference/reorder.cpp b/test/reference/reorder.cpp index 61ba67d1..2ab8eed2 100644 --- a/test/reference/reorder.cpp +++ b/test/reference/reorder.cpp @@ -10,18 +10,18 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" namespace kai::test { template -std::vector reorder_block( - const void* src, size_t height, size_t width, size_t block_height, size_t block_width) { +Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_height, size_t block_width) { const auto num_dst_elements = round_up_multiple(height, block_height) * round_up_multiple(width, block_width); const auto dst_size = round_up_division(num_dst_elements * size_in_bits, 8); - std::vector dst(dst_size); + Buffer dst(dst_size); size_t dst_index = 0; for (size_t y_block = 0; y_block < height; y_block += block_height) { @@ -44,9 +44,9 @@ std::vector reorder_block( return dst; } -template std::vector reorder_block( +template Buffer reorder_block( const void* src, size_t height, size_t width, size_t block_height, size_t block_width); -template std::vector reorder_block( +template Buffer reorder_block( const void* src, size_t height, size_t width, size_t block_height, size_t block_width); } // namespace kai::test diff --git a/test/reference/reorder.hpp b/test/reference/reorder.hpp index 48449e37..514c2ea4 100644 --- a/test/reference/reorder.hpp +++ b/test/reference/reorder.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,8 @@ #include #include +#include "test/common/buffer.hpp" + namespace kai::test { /// Reorders the input matrix block by block. @@ -66,7 +68,6 @@ namespace kai::test { /// @param[in] The reordered matrix. 
/// ``` template -std::vector reorder_block( - const void* src, size_t height, size_t width, size_t block_height, size_t block_width); +Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_height, size_t block_width); } // namespace kai::test diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp index 84958422..1a3dd8c1 100644 --- a/test/reference/transpose.cpp +++ b/test/reference/transpose.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -12,18 +12,18 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" namespace kai::test { -std::vector transpose(const void* data, DataType data_type, size_t height, size_t width) { +Buffer transpose(const void* data, DataType data_type, size_t height, size_t width) { KAI_ASSUME(data_type_size_in_bits(data_type) % 8 == 0); const auto element_size = data_type_size_in_bits(data_type) / 8; - std::vector output; - output.resize(height * width * element_size); + Buffer output(height * width * element_size); const auto* src_ptr = reinterpret_cast(data); @@ -39,10 +39,10 @@ std::vector transpose(const void* data, DataType data_type, size_t heig } template -std::vector transpose_with_padding( +Buffer transpose_with_padding( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size) { - std::vector output(dst_size); + Buffer output(dst_size); for (size_t y = 0; y < width; ++y) { for (size_t x = 0; x < height; ++x) { @@ -54,17 +54,17 @@ std::vector transpose_with_padding( return output; } -template std::vector transpose_with_padding( +template Buffer transpose_with_padding( const void* data, const size_t height, const 
size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size); -template std::vector transpose_with_padding( +template Buffer transpose_with_padding( const void* data, const size_t height, const size_t width, const size_t src_stride, const size_t dst_stride, const size_t dst_size); template -std::vector transpose(const void* src, size_t height, size_t width) { - std::vector dst(round_up_division(height * width * size_in_bits, 8)); +Buffer transpose(const void* src, size_t height, size_t width) { + Buffer dst(round_up_division(height * width * size_in_bits, 8)); for (size_t y = 0; y < width; ++y) { for (size_t x = 0; x < height; ++x) { @@ -75,7 +75,7 @@ std::vector transpose(const void* src, size_t height, size_t width) { return dst; } -template std::vector transpose(const void* src, size_t height, size_t width); -template std::vector transpose(const void* src, size_t height, size_t width); +template Buffer transpose(const void* src, size_t height, size_t width); +template Buffer transpose(const void* src, size_t height, size_t width); } // namespace kai::test diff --git a/test/reference/transpose.hpp b/test/reference/transpose.hpp index 306bc89d..11f031aa 100644 --- a/test/reference/transpose.hpp +++ b/test/reference/transpose.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -10,6 +10,7 @@ #include #include +#include "test/common/buffer.hpp" #include "test/common/data_type.hpp" namespace kai::test { @@ -22,7 +23,7 @@ namespace kai::test { /// @param[in] width Number of columns. /// /// @return The transposed matrix. -std::vector transpose(const void* data, DataType data_type, size_t height, size_t width); +Buffer transpose(const void* data, DataType data_type, size_t height, size_t width); /// Transposes the matrix. 
/// Works for non-packed and packed using provided strides. @@ -37,7 +38,7 @@ std::vector transpose(const void* data, DataType data_type, size_t heig /// @return The transposed matrix. /// template -std::vector transpose_with_padding( +Buffer transpose_with_padding( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size); /// @@ -49,6 +50,6 @@ std::vector transpose_with_padding( /// /// @return The transposed matrix. template -std::vector transpose(const void* src, size_t height, size_t width); +Buffer transpose(const void* src, size_t height, size_t width); } // namespace kai::test diff --git a/test/tests/buffer_test.cpp b/test/tests/buffer_test.cpp index fbd5e259..e6cca044 100644 --- a/test/tests/buffer_test.cpp +++ b/test/tests/buffer_test.cpp @@ -43,7 +43,7 @@ TEST(Buffer, NonePolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); } @@ -99,7 +99,7 @@ TEST(Buffer, ProtectUnderflowPolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); ASSERT_NE(data, MAP_FAILED); @@ -141,7 +141,7 @@ TEST(Buffer, ProtectOverflowPolicy) { const auto buffer = Buffer(buffer_size); - const auto* data = static_cast(buffer.data()); + const auto* data = reinterpret_cast(buffer.data()); ASSERT_NE(data, nullptr); ASSERT_NE(data, MAP_FAILED); diff --git a/test/tests/imatmul_test.cpp b/test/tests/imatmul_test.cpp index dd12fd4f..70b28e55 100644 --- a/test/tests/imatmul_test.cpp +++ b/test/tests/imatmul_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h" +#include 
"test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" @@ -121,9 +122,6 @@ struct IndirectMatMul { MatMulIndirectKernel imatmul; }; -/// Simple byte buffer -using Buffer = std::vector; - /// Convenience type for test list using IndirectMatMulArray = std::array; diff --git a/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp index 2363ace7..66f92189 100644 --- a/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f16_bf16p_bf16p_test.cpp @@ -18,6 +18,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -125,13 +126,13 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. + Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. + Buffer bias{}; ///< Bias. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. }; /// Gets the test data for the current test case. 
@@ -154,7 +155,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -165,7 +166,7 @@ protected: const auto rhs_w = info.n; auto rhs = fill_matrix_random(rhs_h, rhs_w, method.rhs_format, 1); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -173,14 +174,13 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); } - std::vector packed_rhs; - packed_rhs.resize(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + Buffer packed_rhs(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); if (has_rhs_pack) { const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); @@ -257,9 +257,8 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto lhs_start_row = rect.start_row(); const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - std::vector lhs_data; const size_t lhs_packed_size = method.fn_get_packed_lhs_size(info.m, info.k, method.m0, method.k0, 1 /* sr */); - lhs_data.resize(lhs_packed_size); + Buffer lhs_data(lhs_packed_size); uintptr_t lhs_offset = method.fn_get_lhs_offset(lhs_start_row, lhs_stride); uintptr_t lhs_packed_offset = method.fn_get_packed_lhs_offset(lhs_start_row, info.k); @@ -271,9 +270,8 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - std::vector rhs_data; const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data.resize(rhs_packed_size); + Buffer rhs_data(rhs_packed_size); const auto packed_rhs_start_row = rect.start_col(); const auto 
packed_rhs_start_col = 0; @@ -309,8 +307,7 @@ TEST_P(MatMulTestBf16OutFp16, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data.data() + lhs_packed_offset, rhs_data.data() + rhs_packed_offset, NULL, dst.data() + dst_offset, lhs_stride, rhs_stride, dst_stride, -std::numeric_limits::infinity(), diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp index de55cecf..795abc06 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp @@ -22,6 +22,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp_qsi4cxp_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -93,7 +94,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Generates input data. const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -128,7 +129,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon(lhs_start_row, lhs_stride); @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { ref_rhs_qsi4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(rhs_start_row, K, nr, kr, sr); auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); @@ -157,7 +158,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { params.rhs_zero_point = 0; kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0( - 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi4_padded.data()), has_bias ? reinterpret_cast(ref_biases.data()) : nullptr, reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); @@ -171,7 +172,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi4cxp, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp index 0500d8f6..3f59f645 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp @@ -24,6 +24,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -94,7 +95,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Generates input data. const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -129,7 +130,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f16_neon(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f16_neon(lhs_start_row, lhs_stride); @@ -143,7 +144,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { imp_packed_lhs.data() + lhs_packed_offset); const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(rhs_start_row, K, nr, kr, sr); auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); @@ -166,7 +167,7 @@ TEST_P(MatMulTest_f16_qai8dxp_qsi8cxp, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp index de6c3a90..14838981 100644 --- a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p_qai4c32p_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pscalef32_f16_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -85,7 +86,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { // Generates input data. 
const auto ref_lhs_f16 = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -121,7 +122,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f16_neon(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(uint16_t); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qsi8d32pscalef32_f16_neon(lhs_start_row, lhs_stride); @@ -139,7 +140,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const size_t num_blocks_per_row = round_up_division(K, bl); const size_t ref_zp_size = N * num_blocks_per_row; const size_t ref_zp_size_in_bytes = ref_zp_size * sizeof(float); - std::vector ref_rhs_zp_f32(ref_zp_size_in_bytes); + Buffer ref_rhs_zp_f32(ref_zp_size_in_bytes); for (size_t i = 0; i < ref_zp_size; ++i) { reinterpret_cast(ref_rhs_zp_f32.data())[i] = -reinterpret_cast(ref_rhs_zero_points.data())[i] * @@ -154,7 +155,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(rhs_start_row, K, nr, kr, bl); @@ -167,8 +168,8 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { params.rhs_zero_point = 8; kai_run_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon( - 1, N, K, nr, kr, sr, bl, ref_rhs_qau4s0s1.data(), ref_rhs_zp_f32.data(), has_bias ? 
ref_biases.data() : nullptr, - ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qau4s0s1.data()), ref_rhs_zp_f32.data(), + has_bias ? ref_biases.data() : nullptr, ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(uint16_t); const auto dst_stride_col = sizeof(uint16_t); @@ -180,7 +181,7 @@ TEST_P(MatMulTest_f16_qsi8d32p_qai4c32p, EndToEnd) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, imp_dst.data() + dst_offset, dst_stride_row, dst_stride_col, diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index 2b4ca713..5dd2f3b9 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ -19,6 +19,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -322,13 +323,13 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. + Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. 
+ Buffer bias{}; ///< Bias. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. }; /// Gets the test data for the current test case. @@ -351,7 +352,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -362,7 +363,7 @@ protected: const auto rhs_w = info.n; auto rhs = fill_matrix_random(rhs_h, rhs_w, method.rhs_format, 1); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -370,7 +371,7 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); @@ -379,11 +380,11 @@ protected: constexpr size_t nr = 12; constexpr size_t kr = 4; - std::vector packed_rhs; + Buffer packed_rhs; if (method.fn_get_packed_rhs_size) { - packed_rhs.resize(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + packed_rhs = Buffer(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); } else if (method.fn_get_packed_rhs_size_generic_block_size) { - packed_rhs.resize(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); + packed_rhs = Buffer(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); } else { KAI_ERROR("No function to calculate Packed Rhs Matrix Size"); } @@ -462,9 +463,8 @@ TEST_P(MatMulTestBf16, Output) { const auto lhs_start_row = rect.start_row(); const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - std::vector lhs_data; const size_t lhs_packed_size = method.fn_get_packed_lhs_size(info.m, info.k, method.m0, method.k0, 1 /* sr */); - lhs_data.resize(lhs_packed_size); + Buffer lhs_data(lhs_packed_size); 
uintptr_t lhs_offset = method.fn_get_lhs_offset(lhs_start_row, lhs_stride); uintptr_t lhs_packed_offset = method.fn_get_packed_lhs_offset(lhs_start_row, info.k); @@ -476,15 +476,15 @@ TEST_P(MatMulTestBf16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - std::vector rhs_data; + Buffer rhs_data; if (method.fn_get_packed_rhs_size_generic_block_size) { const size_t rhs_packed_size = method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); - rhs_data.resize(rhs_packed_size); + rhs_data = Buffer(rhs_packed_size); } else if (method.fn_get_packed_rhs_size) { const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data.resize(rhs_packed_size); + rhs_data = Buffer(rhs_packed_size); } const auto packed_rhs_start_row = rect.start_col(); @@ -521,8 +521,7 @@ TEST_P(MatMulTestBf16, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data.data() + lhs_packed_offset, rhs_data.data() + rhs_packed_offset, NULL, dst.data() + dst_offset, lhs_stride, rhs_stride, dst_stride, -std::numeric_limits::infinity(), diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index c80e42ad..00774b1e 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -72,7 +72,7 @@ Buffer fill_matrix_raw(size_t height, size_t width, std::function % 8 == 0); Buffer data(size); - auto ptr = static_cast(data.data()); + auto ptr = reinterpret_cast(data.data()); for (size_t y = 0; y < height; ++y) { for (size_t x = 0; x < width; ++x) { @@ -170,7 +170,7 @@ TEST_P(MatMulTest_f32_f32_f32p, EndToEnd) // NOLINT(google-readability-avoid-un Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - m, n, k, 
ref_lhs.data(), 1, imp_packed_rhs->data(), static_cast(imp_dst.data()), 1, 1, clamp_min, + m, n, k, ref_lhs.data(), 1, imp_packed_rhs->data(), reinterpret_cast(imp_dst.data()), 1, 1, clamp_min, clamp_max); // Compare the output of the micro-kernels against the output of the reference implementation. diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index a8d3fb8f..65304569 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -36,6 +36,7 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0.h" #include "test/common/bfloat16.hpp" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -203,7 +204,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -227,7 +228,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = @@ -245,7 +246,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { params.scale_dt = kai_datatype::kai_dt_bf16; kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + 1, rect.width() /* n */, K, nr, kr, sr, bl, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -259,7 +261,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { ASSERT_EQ(imp_dst_size, ref_dst.size()); // Runs the GEMM micro-kernel. - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -349,7 +351,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -382,7 +384,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; @@ -390,7 +392,8 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { params.scale_dt = kai_datatype::kai_dt_bf16; kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, ref_rhs_qsu4_padded.data() + rhs_offset, ref_rhs_qsu4_stride, + 1, rect.width() /* n */, K, nr, kr, sr, bl, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, reinterpret_cast(ref_biases.data() + bias_offset), ref_rhs_scales.data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -402,7 +405,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 5a770fbe..6459c87b 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -33,6 +33,7 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -325,7 +326,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -353,13 +354,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { size_t bias_offset = rhs_start_row * sizeof(float); size_t scale_offset = rhs_start_row * sizeof(float); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 0; ukernel_variant.run_rhs_pack( - 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsi4_padded.data() + rhs_offset, + 1, rect.width() /* n */, K, nr, kr, sr, + reinterpret_cast(ref_rhs_qsi4_padded.data() + rhs_offset), reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -372,7 +374,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -452,7 +454,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -481,12 +483,13 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { size_t bias_offset = rhs_start_row * sizeof(float); size_t scale_offset = rhs_start_row * sizeof(float); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; ukernel_variant.run_rhs_pack( - 1, rect.width() /* n */, K, nr, kr, sr, ref_rhs_qsu4_padded.data() + rhs_offset, + 1, rect.width() /* n */, K, nr, kr, sr, + reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), reinterpret_cast(ref_biases.data() + bias_offset), reinterpret_cast(ref_rhs_scales.data() + scale_offset), imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); @@ -498,7 +501,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -592,7 +595,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); @@ -615,13 +618,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 0; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi4_padded.data()), + reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -631,7 +635,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -723,7 +727,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -748,13 +752,14 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K); ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; ukernel_variant.run_rhs_pack( - 1, N, K, nr, kr, sr, ref_rhs_qsu4_padded.data(), reinterpret_cast(ref_biases.data()), - reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsu4_padded.data()), + reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride = N * sizeof(float); const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); @@ -764,7 +769,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp index f22671a1..64a079cd 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp @@ -24,6 +24,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" #include "test/common/matrix_portion.hpp" @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); const auto lhs_start_row = rect.start_row(); size_t lhs_stride = K * sizeof(float); @@ -162,7 +163,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // * Packs the RHS matrix. 
const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_qsi8cx_params params{.lhs_zero_point = 1, .scale_multiplier = 1.0f}; kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon( 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi8.data()), @@ -186,7 +187,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_nxk_qsi8cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + matmul_lhs_packed_offset, imp_packed_rhs.data() + matmul_rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), @@ -276,7 +277,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); kai_run_lhs_quant_pack_qai8dxp_f32( rect.height(), K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data() + lhs_offset), K * sizeof(float), imp_packed_lhs.data() + lhs_packed_offset); @@ -286,7 +287,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // * Packs the RHS matrix. 
const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi8cxp_qsi8cx_neon(N, K, nr, kr, sr); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_qsi8cx_params params{.lhs_zero_point = 1, .scale_multiplier = 1.0f}; kai_run_rhs_pack_kxn_qsi8cxp_qsi8cx_neon( 1, N, K, nr, kr, sr, reinterpret_cast(ref_rhs_qsi8.data()), @@ -310,7 +311,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi8cxp, EndToEnd_RHS_kxn_qsi8cx) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, imp_packed_lhs.data() + matmul_lhs_packed_offset, imp_packed_rhs.data() + matmul_rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp index 14184151..da00f5cc 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp @@ -20,6 +20,7 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p_qai4c32p_interface.h" #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32pscalef32_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" #include "test/common/matmul_test_common.hpp" @@ -83,7 +84,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { // Generates input data. 
const auto ref_lhs = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); - std::vector ref_biases; + Buffer ref_biases; if (has_bias) { ref_biases = fill_random(N, seed + 2); @@ -112,7 +113,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32pscalef32_f32_neon(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(float); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qsi8d32pscalef32_f32_neon(lhs_start_row, lhs_stride); @@ -130,7 +131,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const size_t num_blocks_per_row = round_up_division(K, bl); const size_t ref_zp_size = N * num_blocks_per_row; const size_t ref_zp_size_in_bytes = ref_zp_size * sizeof(float); - std::vector ref_rhs_zp_f32(ref_zp_size_in_bytes); + Buffer ref_rhs_zp_f32(ref_zp_size_in_bytes); for (size_t i = 0; i < ref_zp_size; ++i) { reinterpret_cast(ref_rhs_zp_f32.data())[i] = -reinterpret_cast(ref_rhs_zero_points.data())[i] * @@ -145,7 +146,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { const auto imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = kai_get_rhs_packed_offset_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon(rhs_start_row, K, nr, kr, bl); @@ -158,8 +159,8 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { params.rhs_zero_point = 8; kai_run_rhs_pack_nxk_qai4c32p_qau4c32s0s1_f32_f32_f32_neon( - 1, N, K, nr, kr, sr, bl, ref_rhs_qau4s0s1.data(), ref_rhs_zp_f32.data(), has_bias ? 
ref_biases.data() : nullptr, - ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qau4s0s1.data()), ref_rhs_zp_f32.data(), + has_bias ? ref_biases.data() : nullptr, ref_rhs_scales.data(), imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(float); const auto dst_stride_col = sizeof(float); @@ -171,7 +172,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qai4c32p, EndToEnd) { // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index a93a0b65..4d82a9d7 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -28,6 +28,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/float16.hpp" #include "test/common/int4.hpp" @@ -208,7 +209,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { // Runs the LHS packing micro-kernel. 
const auto lhs_start_row = rect.start_row(); const auto imp_packed_lhs_size = ukernel_variant.pack_interface.lhs_packed_size(M, K, bl, mr, kr, sr); - std::vector imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size); auto lhs_stride = K * sizeof(float); auto lhs_offset = ukernel_variant.pack_interface.get_lhs_offset(lhs_start_row, lhs_stride); @@ -227,7 +228,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { pack_data_scales_interleave_block(ref_rhs_qsu4.data(), ref_rhs_scales.data(), N, K, bl); const auto imp_packed_rhs_size = ukernel_variant.pack_interface.rhs_packed_size(N, K, nr, kr, bl); - std::vector imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size); const auto rhs_start_row = rect.start_col(); auto rhs_packed_offset = ukernel_variant.pack_interface.get_rhs_packed_offset(rhs_start_row, K, nr, kr, bl); auto rhs_matmul_offset = ukernel_variant.ukernel.interface.get_rhs_packed_offset(rhs_start_row, K, bl); @@ -235,7 +236,8 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { const kai_rhs_pack_qs4cxs1s0_param params{.lhs_zero_point = 1, .rhs_zero_point = 8}; ukernel_variant.pack_interface.rhs_pack( - 1, N, K, nr, kr, sr, bl, ref_rhs_qsu4_scale_f16.data(), nullptr, imp_packed_rhs.data(), 0, ¶ms); + 1, N, K, nr, kr, sr, bl, reinterpret_cast(ref_rhs_qsu4_scale_f16.data()), nullptr, + imp_packed_rhs.data(), 0, ¶ms); const auto dst_stride_row = N * sizeof(float); const auto dst_stride_col = sizeof(float); @@ -247,7 +249,7 @@ TEST_P(MatMulTest_f32_qsi8d32p_qsi4c32p, EndToEnd) { // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.ukernel.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - std::vector imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.ukernel.interface.run_matmul( rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index e3c1cb91..3dacdf78 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -30,6 +30,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_pack_x8p2vlx4_x8_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme.h" +#include "test/common/buffer.hpp" #include "test/common/cpu_info.hpp" #include "test/common/matmul_test_common.hpp" #include "test/common/matrix_portion.hpp" @@ -51,7 +52,6 @@ namespace kai::test { // Ensure static linkage for all functionality local to this test file namespace { -using Buffer = std::vector; using IndirectionBuffer = std::vector; struct KChunk { @@ -458,7 +458,7 @@ const TestReference& get_test_reference(const TestDataId& test_data_id) { const size_t idx = m_i * k_chunk_count + k_chunk_idx; if (pad_testing and m_i == 0) { // Push padding pointers for first row - lhs_qai8_indirect[idx] = lhs_padding.data(); + lhs_qai8_indirect[idx] = reinterpret_cast(lhs_padding.data()); } else { uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len; lhs_qai8_indirect[idx] = reinterpret_cast(offset); @@ -591,11 +591,14 @@ void test_lhs_pack( output_area.end_row(), shape.k, variant.acc_pack.m, variant.acc_pack.k, 1) : imp_packed_lhs_size; + const auto* imp_packed_lhs_ptr = reinterpret_cast(imp_packed_lhs.data()); + const auto* 
ref_packed_lhs_ptr = reinterpret_cast(reference.packed_lhs.data()); + for (size_t i = 0; i < reference.packed_lhs.size(); ++i) { if (i >= imp_packed_lhs_offset && i < imp_packed_lhs_end_offset) { - ASSERT_EQ(imp_packed_lhs[i], reference.packed_lhs[i]); + ASSERT_EQ(imp_packed_lhs_ptr[i], ref_packed_lhs_ptr[i]); } else { - ASSERT_EQ(imp_packed_lhs[i], 0); + ASSERT_EQ(imp_packed_lhs_ptr[i], 0); } } } @@ -627,13 +630,16 @@ void test_rhs_pack( : imp_packed_rhs_size; size_t mismatches = 0; + const auto* imp_packed_rhs_ptr = reinterpret_cast(imp_packed_rhs.data()); + const auto* ref_packed_rhs_ptr = reinterpret_cast(reference.packed_rhs.data()); + for (size_t i = 0; i < reference.packed_rhs.size(); ++i) { if (i >= imp_packed_rhs_offset && i < imp_packed_rhs_end_offset) { - if (imp_packed_rhs[i] != reference.packed_rhs[i]) { + if (imp_packed_rhs_ptr[i] != ref_packed_rhs_ptr[i]) { mismatches += 1; } } else { - if (imp_packed_rhs[i] != 0) { + if (imp_packed_rhs_ptr[i] != 0) { mismatches += 1; } } diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index dbae9718..3c41541c 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -19,6 +19,7 @@ #include #include "kai/kai_common.h" +#include "test/common/buffer.hpp" #include "test/common/compare.hpp" #include "test/common/cpu_info.hpp" #include "test/common/data_format.hpp" @@ -265,16 +266,16 @@ private: protected: /// Cached test data that is shared between multiple test case. struct TestData { - std::vector lhs{}; ///< LHS operand. - std::vector ref_packed_lhs{}; ///< Reference packed LHS. - std::vector rhs{}; ///< RHS operand. - std::vector rhs_scales{}; ///< RHS per-row quantization scales. - std::vector bias{}; ///< Bias. - std::vector rhs_t{}; ///< Transposed RHS matrix. - std::vector ref_packed_rhs{}; ///< Reference packed RHS. - std::vector ref_dst{}; ///< Reference output. - float clamp_min{}; ///< Minimum output value. - float clamp_max{}; ///< Maximum output value. 
+ Buffer lhs{}; ///< LHS operand. + Buffer ref_packed_lhs{}; ///< Reference packed LHS. + Buffer rhs{}; ///< RHS operand. + Buffer rhs_scales{}; ///< RHS per-row quantization scales. + Buffer bias{}; ///< Bias. + Buffer rhs_t{}; ///< Transposed RHS matrix. + Buffer ref_packed_rhs{}; ///< Reference packed RHS. + Buffer ref_dst{}; ///< Reference output. + float clamp_min{}; ///< Minimum output value. + float clamp_max{}; ///< Maximum output value. }; /// Gets the test data for the current test case. @@ -297,7 +298,7 @@ protected: const auto lhs_h = info.m; const auto lhs_w = info.k; auto lhs = fill_matrix_random(lhs_h, lhs_w, method.lhs_format, 0); - std::vector ref_packed_lhs; + Buffer ref_packed_lhs; if (has_lhs_pack) { ref_packed_lhs = @@ -311,7 +312,7 @@ protected: KAI_ASSUME(method.rhs_format.is_raw()); auto rhs_t = transpose(rhs.data(), method.rhs_format.data_type(), rhs_h, rhs_w); - std::vector rhs_scales; + Buffer rhs_scales; if (data_type_is_quantized(method.rhs_format.data_type()) && method.rhs_format.pack_format() == DataFormat::PackFormat::NONE) { rhs_scales = fill_matrix_random(rhs_h, 1, DataFormat(DataType::FP32), 2); @@ -319,17 +320,17 @@ protected: const auto bias_h = 1; const auto bias_w = info.n; - std::vector bias; + Buffer bias; if (has_bias) { bias = fill_matrix_random(bias_h, bias_w, method.bias_format, 3); } - std::vector packed_rhs; + Buffer packed_rhs; if (has_rhs_pack) { packed_rhs = matmul_pack_rhs( - rhs.data(), !rhs_scales.empty() ? 
rhs_scales.data() : nullptr, bias.data(), method.rhs_format, - method.packed_rhs_format, info.n, info.k, true); + rhs.data(), rhs_scales.data(), bias.data(), method.rhs_format, method.packed_rhs_format, info.n, info.k, + true); } KAI_ASSUME(method.lhs_format.is_raw()); @@ -439,8 +440,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_offset = method.packed_lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(packed_lhs_offset, ref_packed_lhs_offset); - std::vector packed_lhs; - packed_lhs.resize(packed_lhs_size); + Buffer packed_lhs(packed_lhs_size); method.fn_pack_lhs( rect.height(), rect.width(), mr, kr, sr, 0, data.lhs.data() + lhs_offset, ref_lhs_row_stride, packed_lhs.data() + packed_lhs_offset); @@ -509,10 +509,10 @@ TEST_P(MatMulTest, PackedRhs) { ASSERT_EQ(bias_offset, ref_bias_offset); /** Perform RHS packing, and compare with reference result **/ - std::vector packed_rhs(packed_rhs_size, 0); + Buffer packed_rhs(packed_rhs_size, 0); method.pack_rhs( height, width, data.rhs.data() + rhs_offset, rhs_row_stride, data.bias.data() + bias_offset, - !data.rhs_scales.empty() ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, + data.rhs_scales.data() != nullptr ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, packed_rhs.data() + packed_rhs_offset); const bool exact = method.packed_rhs_format.pack_format() != DataFormat::PackFormat::QUANTIZE_PER_ROW; @@ -570,12 +570,11 @@ TEST_P(MatMulTest, PackedTransposedRhs) { const auto ref_bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), info.n); ASSERT_EQ(bias_offset, ref_bias_offset); - std::vector packed_rhs; - packed_rhs.resize(packed_rhs_size); + Buffer packed_rhs(packed_rhs_size); method.pack_rhs_nxk( rect.height(), rect.width(), data.rhs_t.data() + rhs_offset, ref_rhs_row_stride, data.bias.data() + bias_offset, - !data.rhs_scales.empty() ? data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, + data.rhs_scales.data() != nullptr ? 
data.rhs_scales.data() + ref_rhs_scales_offset : nullptr, packed_rhs.data() + packed_rhs_offset); const auto exact = method.packed_rhs_format.pack_format() != DataFormat::PackFormat::QUANTIZE_PER_ROW; @@ -619,7 +618,7 @@ TEST_P(MatMulTest, Output) { const auto lhs_start_col = 0; const auto lhs_stride = method.lhs_format.default_row_stride(lhs_w); - const uint8_t* lhs_data = nullptr; + const std::byte* lhs_data = nullptr; uintptr_t lhs_offset = 0; if (method.is_pack_lhs_needed()) { @@ -639,7 +638,7 @@ TEST_P(MatMulTest, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(rhs_w); - const uint8_t* rhs_data = nullptr; + const std::byte* rhs_data = nullptr; uintptr_t rhs_offset = 0; if (method.is_pack_rhs_needed()) { @@ -672,8 +671,7 @@ TEST_P(MatMulTest, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - std::vector dst; - dst.resize(dst_size); + Buffer dst(dst_size); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data + lhs_offset, rhs_data + rhs_offset, bias_data + bias_offset, -- GitLab From d4334fe689aa39e0a6fcc5bdfd5c41694b5cc9f4 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Wed, 7 May 2025 10:56:04 +0100 Subject: [PATCH 2/3] Remove unused headers and update Buffer class * Buffer class now by default doesn't initialize the data buffer. * There is a separate constructor to initialize the buffer with user-provided value. 
Signed-off-by: Viet-Hoa Do --- test/common/buffer.cpp | 4 +++- test/common/buffer.hpp | 3 ++- test/common/int4.hpp | 1 - test/reference/binary_elementwise.cpp | 1 - test/reference/binary_elementwise.hpp | 2 -- test/reference/cast.cpp | 1 - test/reference/cast.hpp | 2 -- test/reference/clamp.cpp | 2 -- test/reference/clamp.hpp | 2 -- test/reference/fill.cpp | 1 - test/reference/fill.hpp | 1 - test/reference/matmul.cpp | 1 - test/reference/matmul.hpp | 1 - test/reference/matmul_pack.cpp | 2 -- test/reference/matmul_pack.hpp | 2 -- test/reference/pack.cpp | 1 - test/reference/pack.hpp | 2 -- test/reference/pad.cpp | 1 - test/reference/pad.hpp | 1 - test/reference/quantize.cpp | 1 - test/reference/quantize.hpp | 1 - test/reference/reduce.cpp | 1 - test/reference/reduce.hpp | 1 - test/reference/reorder.cpp | 2 +- test/reference/reorder.hpp | 2 -- test/reference/transpose.cpp | 1 - test/reference/transpose.hpp | 2 -- .../matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp | 1 - .../matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp | 1 - ...atmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp | 1 - .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 1 - test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 1 - ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 1 - .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 1 - .../matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp | 1 - ...atmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp | 1 - ...atmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp | 1 - .../matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp | 21 ++++++++----------- test/tests/matmul_test.cpp | 7 +++---- 39 files changed, 18 insertions(+), 62 deletions(-) diff --git a/test/common/buffer.cpp b/test/common/buffer.cpp index 3945eba2..86343d0d 100644 --- a/test/common/buffer.cpp +++ b/test/common/buffer.cpp @@ -21,7 +21,7 @@ namespace kai::test { -Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) { +Buffer::Buffer(const size_t size) : m_user_buffer_size(size) { KAI_ASSUME_MSG(size > 0, "Buffers must be of non-zero 
size"); const char* val = getenv("KAI_TEST_BUFFER_POLICY"); @@ -57,7 +57,9 @@ Buffer::Buffer(const size_t size, uint8_t init_value) : m_user_buffer_size(size) default: allocate(); } +} +Buffer::Buffer(const size_t size, uint8_t init_value) : Buffer(size) { memset(data(), init_value, size); } diff --git a/test/common/buffer.hpp b/test/common/buffer.hpp index 4cb06e4b..8224f1f0 100644 --- a/test/common/buffer.hpp +++ b/test/common/buffer.hpp @@ -30,7 +30,8 @@ class Buffer { public: Buffer() = default; - Buffer(size_t size, uint8_t init_value = 0); + explicit Buffer(size_t size); + Buffer(size_t size, uint8_t init_value); Buffer(const Buffer& other) = delete; Buffer(Buffer&& other) noexcept = default; diff --git a/test/common/int4.hpp b/test/common/int4.hpp index aa05d9bd..73d031d5 100644 --- a/test/common/int4.hpp +++ b/test/common/int4.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/binary_elementwise.cpp b/test/reference/binary_elementwise.cpp index 48434e47..8212195e 100644 --- a/test/reference/binary_elementwise.cpp +++ b/test/reference/binary_elementwise.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/binary_elementwise.hpp b/test/reference/binary_elementwise.hpp index 713f8692..d66d7e36 100644 --- a/test/reference/binary_elementwise.hpp +++ b/test/reference/binary_elementwise.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/cast.cpp b/test/reference/cast.cpp index e11cb350..d6728926 100644 --- a/test/reference/cast.cpp +++ b/test/reference/cast.cpp @@ -8,7 +8,6 @@ #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/cast.hpp b/test/reference/cast.hpp index 8dc09b22..ce9c06c5 100644 --- a/test/reference/cast.hpp +++ 
b/test/reference/cast.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/clamp.cpp b/test/reference/clamp.cpp index eadc755e..c797d3a9 100644 --- a/test/reference/clamp.cpp +++ b/test/reference/clamp.cpp @@ -8,8 +8,6 @@ #include #include -#include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/clamp.hpp b/test/reference/clamp.hpp index 52ca57ba..8dacee33 100644 --- a/test/reference/clamp.hpp +++ b/test/reference/clamp.hpp @@ -7,9 +7,7 @@ #pragma once #include -#include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/fill.cpp b/test/reference/fill.cpp index 82068fdb..0e155340 100644 --- a/test/reference/fill.cpp +++ b/test/reference/fill.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/fill.hpp b/test/reference/fill.hpp index 9dd0f26c..29c9cf3b 100644 --- a/test/reference/fill.hpp +++ b/test/reference/fill.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index be5bb6a6..81613212 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index d349d0b4..8ef06490 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/matmul_pack.cpp b/test/reference/matmul_pack.cpp index 40139e09..973cf9b5 100644 --- a/test/reference/matmul_pack.cpp +++ b/test/reference/matmul_pack.cpp @@ -7,8 +7,6 @@ #include 
"test/reference/matmul_pack.hpp" #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/round.hpp" diff --git a/test/reference/matmul_pack.hpp b/test/reference/matmul_pack.hpp index ea713dd7..5bf57659 100644 --- a/test/reference/matmul_pack.hpp +++ b/test/reference/matmul_pack.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/pack.cpp b/test/reference/pack.cpp index e06e3303..cb193917 100644 --- a/test/reference/pack.cpp +++ b/test/reference/pack.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/bfloat16.hpp" diff --git a/test/reference/pack.hpp b/test/reference/pack.hpp index b424d5ba..29180290 100644 --- a/test/reference/pack.hpp +++ b/test/reference/pack.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index 182f6dc8..e857ce74 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index 74c0229a..c21b8c9a 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index c64d8404..008f6676 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "test/common/bfloat16.hpp" #include "test/common/buffer.hpp" diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index e0c7d1f0..d0aa3bcd 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -9,7 +9,6 @@ #include #include #include -#include #include 
"test/common/buffer.hpp" diff --git a/test/reference/reduce.cpp b/test/reference/reduce.cpp index 7045668d..085ac063 100644 --- a/test/reference/reduce.cpp +++ b/test/reference/reduce.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/reduce.hpp b/test/reference/reduce.hpp index 8341f442..8f1ccea7 100644 --- a/test/reference/reduce.hpp +++ b/test/reference/reduce.hpp @@ -8,7 +8,6 @@ #include #include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/reorder.cpp b/test/reference/reorder.cpp index 2ab8eed2..12078730 100644 --- a/test/reference/reorder.cpp +++ b/test/reference/reorder.cpp @@ -21,7 +21,7 @@ Buffer reorder_block(const void* src, size_t height, size_t width, size_t block_ const auto num_dst_elements = round_up_multiple(height, block_height) * round_up_multiple(width, block_width); const auto dst_size = round_up_division(num_dst_elements * size_in_bits, 8); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); size_t dst_index = 0; for (size_t y_block = 0; y_block < height; y_block += block_height) { diff --git a/test/reference/reorder.hpp b/test/reference/reorder.hpp index 514c2ea4..8453ce1e 100644 --- a/test/reference/reorder.hpp +++ b/test/reference/reorder.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" diff --git a/test/reference/transpose.cpp b/test/reference/transpose.cpp index 1a3dd8c1..7c8fbe9e 100644 --- a/test/reference/transpose.cpp +++ b/test/reference/transpose.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/reference/transpose.hpp b/test/reference/transpose.hpp index 11f031aa..f6f8c343 100644 --- a/test/reference/transpose.hpp +++ b/test/reference/transpose.hpp @@ -7,8 +7,6 @@ #pragma once #include -#include -#include #include "test/common/buffer.hpp" #include "test/common/data_type.hpp" diff 
--git a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp index 795abc06..5b529876 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi4cxp_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi4cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi4cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi4cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp index 3f59f645..3a1710f8 100644 --- a/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f16_qai8dxp_qsi8cxp_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qai8dxp_qsi8cxp/kai_matmul_clamp_f16_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp index 14838981..5ba1a687 100644 --- a/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f16_qsi8d32p_qai4c32p_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f16_qsi8d32p_qai4c32p/kai_matmul_clamp_f16_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h" diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index 5dd2f3b9..de0f4ffb 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ 
-16,7 +16,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index 00774b1e..7e811c75 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p16vlx1b_1x16vl_sme2_mla.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 65304569..b191fd4e 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4c32p/kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 6459c87b..dd6a27a2 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1vlx8_qsi4cxp4vlx8_1vlx4vl_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi4cxp4vlx4_1x4vl_sme2_sdot.h" diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp index 64a079cd..1bad32a6 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi8cxp_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include 
"kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h" diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp index da00f5cc..699ced2a 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qai4c32p_test.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qai4c32p4x8_1x4_neon_dotprod.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qai4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qai4c32p4x8_8x4_neon_i8mm.h" diff --git a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp index 4d82a9d7..24326307 100644 --- a/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qsi8d32p_qsi4c32p_test.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h" diff --git a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp index 3dacdf78..62d544ad 100644 --- a/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp +++ b/test/tests/matmul_clamp_qai8_qai8p_qsi8cxp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "kai/ukernels/matmul/imatmul_clamp_qai8_qai8p_qsi8cxp/kai_imatmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme2_mopa.h" @@ -52,8 +51,6 @@ namespace kai::test { // Ensure static linkage for all 
functionality local to this test file namespace { -using IndirectionBuffer = std::vector; - struct KChunk { size_t count; size_t length; @@ -361,7 +358,7 @@ struct TestReference { Buffer lhs_qai8; Buffer lhs_qai8_scales; Buffer lhs_qai8_zero_points; - IndirectionBuffer lhs_qai8_indirect; + Buffer lhs_qai8_indirect; Buffer lhs_qai8_indirect_packed; Buffer lhs_qai8_indirect_padding; size_t lhs_qai8_indirect_offset; @@ -451,17 +448,18 @@ const TestReference& get_test_reference(const TestDataId& test_data_id) { // Setup an indirection buffer, where each "row" contains `k_chunk_count` // pointers to chunks of length `k_chunk_len` in the input_buffer - IndirectionBuffer lhs_qai8_indirect(shape.m * k_chunk_count); + Buffer lhs_qai8_indirect(shape.m * k_chunk_count * sizeof(void*)); Buffer lhs_padding(k_chunk_len, padding_value); + auto* lhs_qai8_indirect_ptr = reinterpret_cast(lhs_qai8_indirect.data()); for (size_t m_i = 0; m_i < shape.m; ++m_i) { for (size_t k_chunk_idx = 0; k_chunk_idx < k_chunk_count; ++k_chunk_idx) { const size_t idx = m_i * k_chunk_count + k_chunk_idx; if (pad_testing and m_i == 0) { // Push padding pointers for first row - lhs_qai8_indirect[idx] = reinterpret_cast(lhs_padding.data()); + lhs_qai8_indirect_ptr[idx] = reinterpret_cast(lhs_padding.data()); } else { uintptr_t offset = m_i * shape.k + k_chunk_idx * k_chunk_len; - lhs_qai8_indirect[idx] = reinterpret_cast(offset); + lhs_qai8_indirect_ptr[idx] = reinterpret_cast(offset); } } } @@ -576,7 +574,7 @@ void test_lhs_pack( variant.lhs_pack->get_packed_lhs_size(shape.m, shape.k, variant.acc_pack.m, variant.acc_pack.k, 1); ASSERT_EQ(imp_packed_lhs_size, reference.packed_lhs.size()); - Buffer imp_packed_lhs(imp_packed_lhs_size); + Buffer imp_packed_lhs(imp_packed_lhs_size, 0); const auto imp_lhs_offset = variant.lhs_pack->get_lhs_offset(output_area.start_row(), shape.k * sizeof(int8_t)); const auto imp_packed_lhs_offset = variant.lhs_pack->get_packed_lhs_offset( output_area.start_row(), shape.k, 
variant.acc_pack.m, variant.acc_pack.k, 1); @@ -608,7 +606,7 @@ void test_rhs_pack( const MatMulShape& shape, const MatMulVariant& variant, const Rect& output_area, const TestReference& reference) { const auto imp_packed_rhs_size = variant.rhs_pack.get_packed_rhs_size(shape.n, shape.k); ASSERT_EQ(imp_packed_rhs_size, reference.packed_rhs.size()); - Buffer imp_packed_rhs(imp_packed_rhs_size); + Buffer imp_packed_rhs(imp_packed_rhs_size, 0); const auto imp_rhs_offset = variant.rhs_pack.get_rhs_offset(output_area.start_col()); const auto imp_bias_offset = variant.rhs_pack.get_bias_offset(output_area.start_col()); @@ -686,7 +684,7 @@ void test_matmul( const auto imp_dst_size = variant.matmul.get_dst_size(shape.m, shape.n); ASSERT_EQ(imp_dst_size, reference.dst_qsi8_clamped.size()); - Buffer imp_dst(imp_dst_size); + Buffer imp_dst(imp_dst_size, 0); const auto [imp_lhs_offset, lhs_data] = [&]() -> std::tuple { if (variant.lhs_pack.has_value()) { return {variant.matmul.get_packed_lhs_offset(output_area.start_row(), shape.k), reference.packed_lhs}; @@ -814,7 +812,6 @@ static Buffer rhs_pack( const KChunk& k_chunk) { // Allocate output buffer const size_t dst_size = variant.get_packed_rhs_size(n, k_chunk.count, k_chunk.length); - Buffer packed_all(dst_size); Buffer packed(dst_size); // Calculate effective quantization parameters @@ -853,7 +850,7 @@ static Buffer matmul( // Allocate output buffer const size_t dst_size = variant.get_dst_size(shape.m, shape.n); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); // Calculate effective quantization parameters kai_matmul_requantize32_params requantization{}; diff --git a/test/tests/matmul_test.cpp b/test/tests/matmul_test.cpp index 3c41541c..c828f56d 100644 --- a/test/tests/matmul_test.cpp +++ b/test/tests/matmul_test.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include "kai/kai_common.h" #include "test/common/buffer.hpp" @@ -440,7 +439,7 @@ TEST_P(MatMulTest, PackedLhs) { const auto ref_packed_lhs_offset =
method.packed_lhs_format.default_offset_in_bytes(rect.start_row(), 0, lhs_w); ASSERT_EQ(packed_lhs_offset, ref_packed_lhs_offset); - Buffer packed_lhs(packed_lhs_size); + Buffer packed_lhs(packed_lhs_size, 0); method.fn_pack_lhs( rect.height(), rect.width(), mr, kr, sr, 0, data.lhs.data() + lhs_offset, ref_lhs_row_stride, packed_lhs.data() + packed_lhs_offset); @@ -570,7 +569,7 @@ TEST_P(MatMulTest, PackedTransposedRhs) { const auto ref_bias_offset = method.bias_format.default_offset_in_bytes(0, rect.start_row(), info.n); ASSERT_EQ(bias_offset, ref_bias_offset); - Buffer packed_rhs(packed_rhs_size); + Buffer packed_rhs(packed_rhs_size, 0); method.pack_rhs_nxk( rect.height(), rect.width(), data.rhs_t.data() + rhs_offset, ref_rhs_row_stride, data.bias.data() + bias_offset, @@ -671,7 +670,7 @@ TEST_P(MatMulTest, Output) { const auto ref_dst_size = method.dst_format.default_size_in_bytes(info.m, info.n); ASSERT_EQ(dst_size, ref_dst_size); - Buffer dst(dst_size); + Buffer dst(dst_size, 0); method.main_kernel( rect.height(), rect.width(), info.k, lhs_data + lhs_offset, rhs_data + rhs_offset, bias_data + bias_offset, -- GitLab From b417754c7b2ec0e51056363add75e5912344446c Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Mon, 19 May 2025 09:42:52 +0100 Subject: [PATCH 3/3] Address review comments Signed-off-by: Viet-Hoa Do --- test/reference/pad.cpp | 14 ++++---- test/reference/pad.hpp | 2 +- .../matmul_clamp_f32_bf16p_bf16p_test.cpp | 20 ++++++----- test/tests/matmul_clamp_f32_f32_f32p_test.cpp | 35 +------------------ 4 files changed, 21 insertions(+), 50 deletions(-) diff --git a/test/reference/pad.cpp b/test/reference/pad.cpp index e857ce74..4a19555f 100644 --- a/test/reference/pad.cpp +++ b/test/reference/pad.cpp @@ -50,13 +50,15 @@ Buffer pad_matrix( Buffer dst(dst_size); - for (size_t y = 0; y < dst_height; ++y) { - for (size_t x = 0; x < dst_width; ++x) { - if (y >= pad_top && y < pad_top + height && x >= pad_left && x < pad_left + width) { - const T value = 
read_array(data, (y - pad_top) * width + x - pad_left); - write_array(dst.data(), y * dst_width + x, value); + for (size_t row = 0; row < dst_height; ++row) { + for (size_t col = 0; col < dst_width; ++col) { + const bool valid_row = row >= pad_top && row < pad_top + height; + const bool valid_col = col >= pad_left && col < pad_left + width; + if (valid_row && valid_col) { + const T value = read_array(data, (row - pad_top) * width + col - pad_left); + write_array(dst.data(), row * dst_width + col, value); } else { - write_array(dst.data(), y * dst_width + x, pad_value); + write_array(dst.data(), row * dst_width + col, pad_value); } } } diff --git a/test/reference/pad.hpp b/test/reference/pad.hpp index c21b8c9a..29089919 100644 --- a/test/reference/pad.hpp +++ b/test/reference/pad.hpp @@ -32,7 +32,7 @@ Buffer pad_row( const void* data, size_t height, size_t width, size_t src_stride, size_t dst_stride, size_t dst_size, uint8_t val = 0); -/// Pads the matrix with value. +/// Creates a padded matrix from an input matrix. /// /// @param[in] data The input data buffer. /// @param[in] height The number of input rows. 
diff --git a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp index de0f4ffb..76ea30b7 100644 --- a/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp +++ b/test/tests/matmul_clamp_f32_bf16p_bf16p_test.cpp @@ -379,15 +379,18 @@ protected: constexpr size_t nr = 12; constexpr size_t kr = 4; - Buffer packed_rhs; + size_t packed_rhs_size = 0; + if (method.fn_get_packed_rhs_size) { - packed_rhs = Buffer(method.fn_get_packed_rhs_size(rhs_w, rhs_h)); + packed_rhs_size = method.fn_get_packed_rhs_size(rhs_w, rhs_h); } else if (method.fn_get_packed_rhs_size_generic_block_size) { - packed_rhs = Buffer(method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr)); + packed_rhs_size = method.fn_get_packed_rhs_size_generic_block_size(rhs_w, rhs_h, nr, kr); } else { KAI_ERROR("No function to calculate Packed Rhs Matrix Size"); } + Buffer packed_rhs(packed_rhs_size); + if (has_rhs_pack) { const auto ref_rhs_row_stride = method.rhs_format.default_row_stride(rhs_w); method.pack_rhs( @@ -475,17 +478,16 @@ TEST_P(MatMulTestBf16, Output) { const auto rhs_stride = method.rhs_format.default_row_stride(info.n); - Buffer rhs_data; + size_t rhs_packed_size = 0; if (method.fn_get_packed_rhs_size_generic_block_size) { - const size_t rhs_packed_size = - method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); - rhs_data = Buffer(rhs_packed_size); + rhs_packed_size = method.fn_get_packed_rhs_size_generic_block_size(info.n, info.k, method.n0, method.k0); } else if (method.fn_get_packed_rhs_size) { - const size_t rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); - rhs_data = Buffer(rhs_packed_size); + rhs_packed_size = method.fn_get_packed_rhs_size(info.n, info.k); } + Buffer rhs_data(rhs_packed_size); + const auto packed_rhs_start_row = rect.start_col(); const auto packed_rhs_start_col = 0; diff --git a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp 
b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp index 7e811c75..3b37518a 100644 --- a/test/tests/matmul_clamp_f32_f32_f32p_test.cpp +++ b/test/tests/matmul_clamp_f32_f32_f32p_test.cpp @@ -30,6 +30,7 @@ #include "test/common/memory.hpp" #include "test/common/test_suite.hpp" #include "test/reference/clamp.hpp" +#include "test/reference/fill.hpp" #include "test/reference/matmul.hpp" namespace kai::test { @@ -63,40 +64,6 @@ const std::array, 2> ukern "matmul_clamp_f32_f32_f32p2vlx1b_1x16vl_sme2_mla", cpu_has_sme2}}}; -// TODO: Reimplement these helpers in fill.cpp. These methods are currently duplicated here so they can be specialized -// on the Buffer return type. -template -Buffer fill_matrix_raw(size_t height, size_t width, std::function gen) { - const auto size = height * width * size_in_bits / 8; - KAI_ASSUME(width * size_in_bits % 8 == 0); - - Buffer data(size); - auto ptr = reinterpret_cast(data.data()); - - for (size_t y = 0; y < height; ++y) { - for (size_t x = 0; x < width; ++x) { - write_array(ptr, y * width + x, gen(y, x)); - } - } - - return data; -} - -template -Buffer fill_matrix_random_raw(size_t height, size_t width, uint32_t seed) { - using TDist = std::conditional_t< - std::is_floating_point_v, std::uniform_real_distribution, std::uniform_int_distribution>; - - std::mt19937 rnd(seed); - TDist dist; - - return fill_matrix_raw(height, width, [&](size_t, size_t) { return dist(rnd); }); -} - -template -Buffer fill_random(size_t length, uint32_t seed) { - return fill_matrix_random_raw(1, length, seed); -} } // namespace class MatMulTest_f32_f32_f32p : public ::testing::TestWithParam {}; -- GitLab