From f5dee582fea8a008e9572fc4e38efb6bad4fc7bd Mon Sep 17 00:00:00 2001 From: Evie Wright Date: Wed, 16 Jul 2025 13:46:59 +0100 Subject: [PATCH 1/6] Refactor matmul test to reduce code duplication Signed-off-by: Evie Wright --- test/reference/matmul.cpp | 81 +++ test/reference/matmul.hpp | 11 + ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 522 +++++++----------- 3 files changed, 292 insertions(+), 322 deletions(-) diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp index 81613212..9eba8b95 100644 --- a/test/reference/matmul.cpp +++ b/test/reference/matmul.cpp @@ -378,6 +378,87 @@ template Buffer matmul_nt_t_quantized( + size_t m, size_t n, size_t k, // + const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + size_t lhs_quant_width, // + const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, + size_t rhs_quant_width, // + const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width); + +template < + typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, + typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> +Buffer matmul_nt_nt_quantized( + size_t m, size_t n, size_t k, // + const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, // + size_t lhs_quant_height, size_t lhs_quant_width, // + const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, // + size_t rhs_quant_height, size_t rhs_quant_width, // + const void* bias_data, const void* bias_scales, const void* bias_zero_points, // + size_t bias_quant_width) { + const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width); + const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width); + + Buffer dst(m * n * sizeof(DstData)); + + for (size_t row = 0; row < m; ++row) { + for (size_t col = 0; col < n; ++col) { + DstData acc = 0; + + for (size_t i = 0; i < k; ++i) { + const auto lhs_data_index = row * k + i; + const auto lhs_quant_index = (row / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width; + const auto lhs_value = read_array(lhs_data, lhs_data_index); + const auto lhs_scale = lhs_scales != nullptr ? read_array(lhs_scales, lhs_quant_index) + : static_cast(1); + const auto lhs_zero_point = lhs_zero_points != nullptr + ? read_array(lhs_zero_points, lhs_quant_index) + : static_cast(0); + + const auto rhs_data_index = col + i * n; + const auto rhs_quant_index = (col / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width; + const auto rhs_value = read_array(rhs_data, rhs_data_index); + const auto rhs_scale = rhs_scales != nullptr ? read_array(rhs_scales, rhs_quant_index) + : static_cast(1); + const auto rhs_zero_point = rhs_zero_points != nullptr + ? read_array(rhs_zero_points, rhs_quant_index) + : static_cast(0); + + acc += (static_cast(lhs_value) - static_cast(lhs_zero_point)) * + static_cast(lhs_scale) * + (static_cast(rhs_value) - static_cast(rhs_zero_point)) * + static_cast(rhs_scale); + } + + if (bias_data != nullptr) { + const auto bias_value = read_array(bias_data, col); + const auto bias_scale = bias_scales != nullptr + ? read_array(bias_scales, col / bias_quant_width) + : static_cast(1); + const auto bias_zero_point = bias_zero_points != nullptr + ? 
read_array(bias_zero_points, col / bias_quant_width) + : static_cast(0); + + acc += (static_cast(bias_value) - static_cast(bias_zero_point)) * + static_cast(bias_scale); + } + + write_array(dst.data(), row * n + col, acc); + } + } + + return dst; +} + +template Buffer matmul_nt_nt_quantized( + size_t m, size_t n, size_t k, // + const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + size_t lhs_quant_width, // + const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, + size_t rhs_quant_width, // + const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width); + template Buffer indirect_matmul_nt_t_quantized( size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length, // diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp index 8ef06490..673b1310 100644 --- a/test/reference/matmul.hpp +++ b/test/reference/matmul.hpp @@ -192,6 +192,17 @@ Buffer matmul_nt_t_quantized( size_t rhs_quant_width, // const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width); +template < + typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, + typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> +Buffer matmul_nt_nt_quantized( + size_t m, size_t n, size_t k, // + const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height, + size_t lhs_quant_width, // + const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height, + size_t rhs_quant_width, // + const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width); + template < typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale, typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData> diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index be8e2b71..226abc21 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -47,6 +47,7 @@ #include "test/common/round.hpp" #include "test/common/test_suite.hpp" #include "test/reference/cast.hpp" +#include "test/reference/clamp.hpp" #include "test/reference/fill.hpp" #include "test/reference/matmul.hpp" #include "test/reference/pad.hpp" @@ -54,6 +55,9 @@ #include "test/reference/transpose.hpp" namespace kai::test { + +enum class RhsPackType { NxK, KxN }; + static const std::array, 11> variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p = { {{UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod), @@ -79,279 +83,157 @@ static const std::array; - -class MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p : public ::testing::TestWithParam {}; - -TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_RHS) { - const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); - const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); - - if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { - GTEST_SKIP() << "Unsupported CPU feature"; - } - - const size_t M = matmul_shape.m; - const size_t N = matmul_shape.n; - const size_t K = matmul_shape.k; - - auto m_step = ukernel_variant.interface.get_m_step(); - auto n_step = 
ukernel_variant.interface.get_n_step(); - - const auto rect = portion.compute_portion(M, N, m_step, n_step); - if (rect.height() == 0 || rect.width() == 0) { - GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")"; +// Reference quantization and packing => Int4 per channel +// * Generates signed (used for reference matmul) and unsigned (used to test micro-kernel matmul) packed matrices +// * Generates reference scales from input RHS matrix +static inline std::tuple ref_pack_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer* ref_rhs, RhsPackType pack_type) { + auto [ref_rhs_qsi4, ref_scales] = + quantize_symmetric_per_block_dynamic((*ref_rhs).data(), N, K, bl); + if (pack_type == RhsPackType::KxN) { + // Non-Transposed(kxn) RHS dimensions + const size_t ref_rhs_qsi4_kxn_stride = round_up_multiple(N, 2); + const size_t ref_rhs_qsi4_kxn_size = K * ref_rhs_qsi4_kxn_stride; + const size_t ref_rhs_qsi4_kxn_size_bytes = round_up_division(ref_rhs_qsi4_kxn_size, 2); + + auto ref_rhs_qsi4_kxn = transpose_with_padding( + ref_rhs_qsi4.data(), N, K, K, ref_rhs_qsi4_kxn_stride, ref_rhs_qsi4_kxn_size_bytes); + + const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4_kxn.data(), ref_rhs_qsi4_kxn_size); + auto ref_rhs_qsu4_padded = pad_row( + ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); + + return {std::move(ref_rhs_qsi4_kxn), std::move(ref_rhs_qsu4_padded), std::move(ref_scales)}; + } else { + const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), N * K); + auto ref_rhs_qsu4_padded = pad_row( + ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); + + return {std::move(ref_rhs_qsi4), std::move(ref_rhs_qsu4_padded), std::move(ref_scales)}; } - - const auto nr = ukernel_variant.interface.get_nr(); - const auto kr = ukernel_variant.interface.get_kr(); - const auto sr = ukernel_variant.interface.get_sr(); - kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - - const auto rhs_start_row = rect.start_col(); - auto rhs_packed_offset_kxn = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); - auto rhs_packed_offset_nxk = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); - - ASSERT_EQ(rhs_packed_offset_kxn, rhs_packed_offset_nxk); - - auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); - ASSERT_EQ(rhs_packed_offset_kxn, rhs_matmul_offset); } -TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_LHS) { - const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); - const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); +// Runs the (scalar) RHS packing micro-kernel. +static inline std::tuple imp_pack_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, + const Buffer* ref_biases, const Buffer* ref_scales, RhsPackType pack_type, size_t start_row, size_t rect_width) { + const size_t ref_rhs_qsu4_stride = + pack_type == RhsPackType::NxK ? 
round_up_division(K, 2) : round_up_division(N, 2); + const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); - if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { - GTEST_SKIP() << "Unsupported CPU feature"; + size_t rhs_offset, rhs_packed_offset; + if (pack_type == RhsPackType::KxN) { + rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(start_row, ref_rhs_qsu4_stride); + rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(start_row, K, nr, kr, sr, bl, scale_dt); + } else { + rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(start_row, ref_rhs_qsu4_stride); + rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(start_row, K, nr, kr, sr, bl, scale_dt); } - const size_t M = matmul_shape.m; - const size_t N = matmul_shape.n; - const size_t K = matmul_shape.k; - - auto m_step = ukernel_variant.interface.get_m_step(); - auto n_step = ukernel_variant.interface.get_n_step(); + size_t bias_offset = start_row * sizeof(float); + size_t scale_offset = start_row * ref_rhs_scales_stride; - const auto rect = portion.compute_portion(M, N, m_step, n_step); - if (rect.height() == 0 || rect.width() == 0) { - GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")"; + size_t imp_packed_rhs_size; + if (pack_type == RhsPackType::KxN) { + imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); + } else { + imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); } - const auto mr = ukernel_variant.interface.get_mr(); - const auto kr = ukernel_variant.interface.get_kr(); - const auto sr = ukernel_variant.interface.get_sr(); - - const auto lhs_start_row = rect.start_row(); - auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); - auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); - - ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); -} - -TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { - const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); - const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); - - if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { - GTEST_SKIP() << "Unsupported CPU feature"; - } - - constexpr uint32_t seed = 0; - - const size_t M = matmul_shape.m; - const size_t N = matmul_shape.n; - const size_t K = matmul_shape.k; - - const auto mr = ukernel_variant.interface.get_mr(); - const auto nr = ukernel_variant.interface.get_nr(); - const auto kr = ukernel_variant.interface.get_kr(); - const auto sr = ukernel_variant.interface.get_sr(); - - // Generates input data. - const auto ref_lhs = fill_random(M * K, seed + 0); - const auto ref_rhs = fill_random(N * K, seed + 1); - const auto ref_biases = fill_random(N, seed + 2); - kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - - // Runs the reference implementation. - // * Quantizes the LHS matrix using 8-bit asymmetric quantization. - // * Quantizes the RHS matrix using 4-bit symmetric quantization. - // * Performs GEMM. 
- const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] = - quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K); - const auto [ref_rhs_qsi4, ref_rhs_scales] = - quantize_symmetric_per_block_dynamic(ref_rhs.data(), N, K, bl); - - const auto ref_dst = matmul_clamp_nt_t( - M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), K, ref_rhs_qsi4.data(), - ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits::lowest(), - std::numeric_limits::max()); - - auto m_step = ukernel_variant.interface.get_m_step(); - ASSERT_TRUE(m_step % mr == 0); - - auto n_step = ukernel_variant.interface.get_n_step(); - ASSERT_TRUE(n_step % nr == 0); + Buffer imp_packed_rhs(imp_packed_rhs_size); - const auto rect = portion.compute_portion(M, N, m_step, n_step); - if (rect.height() == 0 || rect.width() == 0) { - GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")"; + if (pack_type == RhsPackType::KxN) { + kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{}; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + params.scale_dt = scale_dt; + + kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( + 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset), + ref_rhs_qsu4_stride, reinterpret_cast((*ref_biases).data() + bias_offset), + (*ref_scales).data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, + ¶ms); + } else { + kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{}; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + params.scale_dt = scale_dt; + + kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( + 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset), + ref_rhs_qsu4_stride, reinterpret_cast((*ref_biases).data() + bias_offset), + (*ref_scales).data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, + ¶ms); } + return {std::move(imp_packed_rhs), rhs_packed_offset}; +} - const auto lhs_start_row = rect.start_row(); - size_t lhs_stride = K * sizeof(float); - - // Runs the LHS packing micro-kernel. - const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); - Buffer imp_packed_lhs(imp_packed_lhs_size); - - auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); - auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); - auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K); - ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset); - - kai_run_lhs_quant_pack_qai8dxp_f32( - rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, - reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, - imp_packed_lhs.data() + lhs_packed_offset); - - // Runs the RHS packing micro-kernel. - // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. - // * Packs the RHS matrix. 
- const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), N * K); - const auto ref_rhs_qsu4_padded = pad_row( - ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); +static inline std::tuple imp_pack_rhs_qsi4c32p_neon( + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, + const Buffer* ref_biases, const Buffer* ref_scales, size_t start_row, size_t rect_width) { + KAI_ASSUME(kr / sr == 8 || kr / sr == 4); const size_t ref_rhs_qsu4_stride = round_up_division(K, 2); const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); + size_t bias_offset = start_row * sizeof(float); + size_t scale_offset = start_row * ref_rhs_scales_stride; - const auto imp_packed_rhs_size = - kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - Buffer imp_packed_rhs(imp_packed_rhs_size); - - const auto rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); - auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); - ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - - auto rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride); - size_t bias_offset = rhs_start_row * sizeof(float); - size_t scale_offset = rhs_start_row * ref_rhs_scales_stride; + size_t imp_packed_rhs_size_neon, rhs_packed_offset_neon, rhs_offset_neon; + if (kr / sr == 8) { + imp_packed_rhs_size_neon = + kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); + rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon( + start_row, K, nr, kr, sr, bl, scale_dt); + rhs_offset_neon = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(start_row, ref_rhs_qsu4_stride); + } else { + imp_packed_rhs_size_neon = + kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); + rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon( + start_row, K, nr, kr, sr, bl, scale_dt); + rhs_offset_neon = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(start_row, ref_rhs_qsu4_stride); + } + Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon); kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; - params.scale_dt = kai_datatype::kai_dt_bf16; - - kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, - reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data() + bias_offset), - reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, - imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); - - const auto dst_stride = N * sizeof(float); - const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); - const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); - ASSERT_EQ(dst_offset, ref_dst_offset); - - const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); - ASSERT_EQ(imp_dst_size, ref_dst.size()); + params.scale_dt = scale_dt; - // Runs the GEMM micro-kernel. 
- Buffer imp_dst(imp_dst_size); if (kr / sr == 8) { - // Test that vectorized packing kernel for nrx8 gives same output as scalar - const auto imp_packed_rhs_size_neon = - kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); - ASSERT_EQ(imp_packed_rhs_size_neon, imp_packed_rhs_size); - - Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon); - - auto rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon( - rhs_start_row, K, nr, kr, sr, bl, scale_dt); - ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset); - - auto rhs_offset_neon = - kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(rhs_start_row, ref_rhs_qsu4_stride); - kai_run_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon( - 1, rect.width() /* n */, K, nr, kr, sr, bl, - reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset_neon), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data() + bias_offset), - reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, + 1, rect_width /* n */, K, nr, kr, sr, bl, + reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset_neon), ref_rhs_qsu4_stride, + reinterpret_cast((*ref_biases).data() + bias_offset), + reinterpret_cast((*ref_scales).data() + scale_offset), ref_rhs_scales_stride, imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, ¶ms); - - ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, - imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), - N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - - DefaultMismatchHandler handler(0, 0.1, 0, 0.05); - DataFormat dst_format = DataFormat(DataType::FP32); - const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler); - ASSERT_TRUE(success); - } else if (kr / sr == 4) { - // Test that vectorized packing kernel for nrx4 gives same output as scalar - const auto imp_packed_rhs_size_neon = - kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); - ASSERT_EQ(imp_packed_rhs_size_neon, imp_packed_rhs_size); - - Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon); - - auto rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon( - rhs_start_row, K, nr, kr, sr, bl, scale_dt); - ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset); - - auto rhs_offset_neon = - kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(rhs_start_row, ref_rhs_qsu4_stride); - + } else { kai_run_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon( - 1, rect.width() /* n */, K, nr, kr, sr, bl, - reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset_neon), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data() + bias_offset), - reinterpret_cast(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride, + 1, rect_width /* n */, K, nr, kr, sr, bl, + reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset_neon), ref_rhs_qsu4_stride, + reinterpret_cast((*ref_biases).data() + bias_offset), + reinterpret_cast((*ref_scales).data() + scale_offset), ref_rhs_scales_stride, imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, ¶ms); - - ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, - imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), - N * sizeof(float), sizeof(float), 
std::numeric_limits::lowest(), std::numeric_limits::max()); - - DefaultMismatchHandler handler(0, 0.1, 0, 0.05); - DataFormat dst_format = DataFormat(DataType::FP32); - const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler); - ASSERT_TRUE(success); } - ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, - imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), - N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - - // Compares the output of the micro-kernels against the output of the reference implementation for the portion - // tested. - // Compares the output of the micro-kernels against the output of the reference implementation for the portion - // tested. - DefaultMismatchHandler handler(0, 0.1, 0, 0.05); - DataFormat dst_format = DataFormat(DataType::FP32); - const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler); - ASSERT_TRUE(success); + return {std::move(imp_packed_rhs_neon), rhs_packed_offset_neon}; } -TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { - const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); +using MatMulTestParams_withBL_withRHSPackType = std::tuple; + +class MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p + : public ::testing::TestWithParam {}; + +TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { + const auto& [variant_index, matmul_shape, bl, portion, rhs_pack_type] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { - GTEST_SKIP() << "Unsupported CPU feature"; + GTEST_SKIP() << "CPU features are not supported by current CPU"; } - const uint32_t seed = 0; + const std::uint32_t seed = 0; const size_t M = matmul_shape.m; const size_t N = matmul_shape.n; @@ -362,38 +244,6 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const auto kr = ukernel_variant.interface.get_kr(); const auto sr = ukernel_variant.interface.get_sr(); - // Generates input data. - const auto ref_lhs = fill_random(M * K, seed + 0); - const auto ref_rhs_transposed = fill_random(N * K, seed + 1); - const auto ref_biases = fill_random(N, seed + 2); - kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - - // Transposed(nxk) RHS dimensions - const size_t ref_rhs_qsi4_nxk_stride = K; - - // Non-Transposed(kxn) RHS dimensions - const size_t ref_rhs_qsi4_kxn_stride = round_up_multiple(N, 2); - const size_t ref_rhs_qsi4_kxn_size = K * ref_rhs_qsi4_kxn_stride; - const size_t ref_rhs_qsi4_kxn_size_bytes = round_up_division(ref_rhs_qsi4_kxn_size, 2); - - // Runs the reference implementation. - // * Quantizes the LHS matrix using 8-bit asymmetric quantization. - // * Quantizes the RHS matrix using 4-bit symmetric quantization. - // * Performs GEMM. 
-    const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
-        quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K);
-    const auto [ref_rhs_qsi4_transposed, ref_rhs_scales] =
-        quantize_symmetric_per_block_dynamic(ref_rhs_transposed.data(), N, K, bl);
-
-    auto ref_rhs_qsi4 = transpose_with_padding(
-        ref_rhs_qsi4_transposed.data(), N, K, ref_rhs_qsi4_nxk_stride, ref_rhs_qsi4_kxn_stride,
-        ref_rhs_qsi4_kxn_size_bytes);
-
-    const auto ref_dst = matmul_clamp_nt_nt(
-        M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), K, ref_rhs_qsi4.data(),
-        ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits::lowest(),
-        std::numeric_limits::max());
-
     auto m_step = ukernel_variant.interface.get_m_step();
     ASSERT_TRUE(m_step % mr == 0);
 
@@ -405,81 +255,108 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
         GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
     }
 
-    const auto lhs_start_row = rect.start_row();
-    size_t lhs_stride = K * sizeof(float);
+    // Generates input data.
+    const auto ref_lhs = fill_random(M * K, seed + 0);
+    const auto ref_rhs = fill_random(N * K, seed + 1);
+    const auto ref_biases = fill_random(N, seed + 2);
+    kai_datatype scale_dt = kai_datatype::kai_dt_bf16;
+
+    // Runs the reference implementation.
+    // * Quantizes the LHS matrix using 8-bit asymmetric quantization.
+    // * Quantizes the RHS matrix using 4-bit symmetric quantization.
+    // * Performs GEMM.
+    const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
+        quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K);
+    const auto [ref_rhs_qsi4, ref_rhs_qsu4, ref_rhs_scales] = ref_pack_rhs_qsi4c32p(N, K, bl, &ref_rhs, rhs_pack_type);
+
+    Buffer ref_dst_noclamp;
+
+    if (rhs_pack_type == RhsPackType::NxK) {
+        ref_dst_noclamp =
+            matmul_nt_t_quantized(
+                M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
+                ref_rhs_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr, 1);
+    } else {
+        ref_dst_noclamp =
+            matmul_nt_nt_quantized(
+                M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
+                ref_rhs_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr, 1);
+    }
+
+    // Clamps the reference output.
+    const auto clamp_ratio = 0.8F;
+    const auto [clamp_min, clamp_max] = find_clamp_range(ref_dst_noclamp.data(), M * N, clamp_ratio);
+    const auto ref_dst = clamp(ref_dst_noclamp.data(), M * N, clamp_min, clamp_max);
 
     // Runs the LHS packing micro-kernel.
+    const auto lhs_start_row = rect.start_row();
     const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr);
-    Buffer imp_packed_lhs(imp_packed_lhs_size);
+    Buffer imp_packed_lhs = Buffer(imp_packed_lhs_size);
+
+    auto lhs_stride = K * sizeof(float);
 
     auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride);
     auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr);
    auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K);
+
     ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset);
 
     kai_run_lhs_quant_pack_qai8dxp_f32(
-        rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/,
-        reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride,
-        imp_packed_lhs.data() + lhs_packed_offset);
-
-    // Runs the RHS packing micro-kernel.
- // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. - // * Packs the RHS matrix. - const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), ref_rhs_qsi4_kxn_size); - const auto ref_rhs_qsu4_padded = pad_row( - ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); - const size_t ref_rhs_qsu4_stride = round_up_division(N, 2); - const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); + rect.height() /* m */, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data() + lhs_offset), + lhs_stride, reinterpret_cast(imp_packed_lhs.data()) + lhs_packed_offset); - const auto rhs_start_row = rect.start_col(); - auto rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt); - auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); - ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); + const size_t rhs_start_row = rect.start_col(); - size_t bias_offset = rhs_start_row * sizeof(float); - size_t scale_offset = rhs_start_row * ref_rhs_scales_stride; + auto [imp_packed_rhs, rhs_packed_offset] = imp_pack_rhs_qsi4c32p( + N, K, bl, nr, kr, sr, scale_dt, &ref_rhs_qsu4, &ref_biases, &ref_rhs_scales, rhs_pack_type, rhs_start_row, + rect.width()); - const auto imp_packed_rhs_size = - kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); - Buffer imp_packed_rhs(imp_packed_rhs_size); - - kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{}; - params.lhs_zero_point = 1; - params.rhs_zero_point = 8; - params.scale_dt = kai_datatype::kai_dt_bf16; - - kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( - 1, rect.width() /* n */, K, nr, kr, sr, bl, - reinterpret_cast(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride, - reinterpret_cast(ref_biases.data() + bias_offset), ref_rhs_scales.data() + scale_offset, - ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); + auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); + ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); - const auto dst_stride = N * sizeof(float); - const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); - const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + const auto dst_stride_row = N * sizeof(float); + const auto dst_stride_col = sizeof(float); + const auto dst_offset = + ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride_row); + const auto ref_dst_offset = rect.start_row() * dst_stride_row + rect.start_col() * dst_stride_col; ASSERT_EQ(dst_offset, ref_dst_offset); // Runs the GEMM micro-kernel. 
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - Buffer imp_dst(imp_dst_size); + Buffer imp_dst = Buffer(imp_dst_size); + ukernel_variant.interface.run_matmul( - rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, - imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), - N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); + rect.height(), rect.width(), K, bl, reinterpret_cast(imp_packed_lhs.data()) + lhs_matmul_offset, + reinterpret_cast(imp_packed_rhs.data()) + rhs_matmul_offset, + reinterpret_cast(imp_dst.data() + dst_offset), dst_stride_row, dst_stride_col, clamp_min, clamp_max); // Compares the output of the micro-kernels against the output of the reference implementation for the portion // tested. DefaultMismatchHandler handler(0, 0.1, 0, 0.05); DataFormat dst_format = DataFormat(DataType::FP32); - const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler); + const auto success = + compare(reinterpret_cast(imp_dst.data()), ref_dst.data(), dst_format, M, N, rect, handler); ASSERT_TRUE(success); + + // Test that using vectorized packing gives same output as scalar + if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) { + const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32p_neon( + N, K, bl, nr, kr, sr, scale_dt, &ref_rhs_qsu4, &ref_biases, &ref_rhs_scales, rhs_start_row, rect.width()); + ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset); + + ukernel_variant.interface.run_matmul( + rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset, + imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast(imp_dst.data() + dst_offset), + dst_stride_row, dst_stride_col, clamp_min, clamp_max); + + const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler); + ASSERT_TRUE(success); + } } INSTANTIATE_TEST_SUITE_P( - MatMul, MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, + MatMul, MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, testing::Combine( testing::Range(0, variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.size()), testing::Values( @@ -490,20 +367,21 @@ INSTANTIATE_TEST_SUITE_P( MatMulShape{1, 25, 64}), testing::Values(32, 64), testing::Values( - MatrixPortion(0, 0, 1, 1), // Full matrix. - MatrixPortion(0, 0, 1, 0.25f), // Leftmost portion. - MatrixPortion(0, 0.75f, 1, 1), // Rightmost portion. - MatrixPortion(0, 0.5f, 1, 0.8f) // Somewhere Middle - )), + MatrixPortion(0, 0, 1, 1), // Full matrix. + MatrixPortion(0, 0, 1, 0.25f), // Leftmost portion. + MatrixPortion(0, 0.75f, 1, 1), // Rightmost portion. + MatrixPortion(0, 0.5f, 1, 0.8f)), // Somewhere Middle + testing::Values(RhsPackType::NxK, RhsPackType::KxN)), [](const auto& info) { const auto variant_idx = std::get<0>(info.param); const std::string name{variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_idx).name}; const auto shape = std::get(info.param); const auto bl = std::get<2>(info.param); const auto portion = std::get<3>(info.param); + const RhsPackType rhs_pack_type = std::get<4>(info.param); std::ostringstream sstream; - sstream << name << "__"; + sstream << name << ((rhs_pack_type == RhsPackType::NxK) ? 
"__NxK" : "__KxN") << "__"; PrintTo(shape, &sstream); sstream << "__BL_" << bl << "__"; PrintTo(portion, &sstream); -- GitLab From f01c004a957eb145b2e5e6902d24c97a75046472 Mon Sep 17 00:00:00 2001 From: Evie Wright Date: Wed, 16 Jul 2025 15:05:35 +0100 Subject: [PATCH 2/6] clean up comments Signed-off-by: Evie Wright --- test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 226abc21..75249df2 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -83,7 +82,7 @@ static const std::array Int4 per channel +// Reference quantization and packing => Int4 per-block. // * Generates signed (used for reference matmul) and unsigned (used to test micro-kernel matmul) packed matrices // * Generates reference scales from input RHS matrix static inline std::tuple ref_pack_rhs_qsi4c32p( @@ -113,7 +112,7 @@ static inline std::tuple ref_pack_rhs_qsi4c32p( } } -// Runs the (scalar) RHS packing micro-kernel. +// Executes the scalar RHS packing micro-kernel. static inline std::tuple imp_pack_rhs_qsi4c32p( size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, const Buffer* ref_biases, const Buffer* ref_scales, RhsPackType pack_type, size_t start_row, size_t rect_width) { @@ -170,6 +169,7 @@ static inline std::tuple imp_pack_rhs_qsi4c32p( return {std::move(imp_packed_rhs), rhs_packed_offset}; } +// Executes the vectorized RHS packing micro-kernels for block length of 4 bytes or 8 bytes static inline std::tuple imp_pack_rhs_qsi4c32p_neon( size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, const Buffer* ref_biases, const Buffer* ref_scales, size_t start_row, size_t rect_width) { @@ -339,7 +339,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { compare(reinterpret_cast(imp_dst.data()), ref_dst.data(), dst_format, M, N, rect, handler); ASSERT_TRUE(success); - // Test that using vectorized packing gives same output as scalar + // Test vectorized packing functions, if packing parameters allow if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) { const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32p_neon( N, K, bl, nr, kr, sr, scale_dt, &ref_rhs_qsu4, &ref_biases, &ref_rhs_scales, rhs_start_row, rect.width()); -- GitLab From ed433c21201555b69b770e82e41ff200dc147e3d Mon Sep 17 00:00:00 2001 From: Evie Wright Date: Fri, 18 Jul 2025 12:12:01 +0100 Subject: [PATCH 3/6] address code review comments: better-fit variable naming and refactoring, general code cleanup Signed-off-by: Evie Wright --- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 197 +++++++++--------- 1 file changed, 98 insertions(+), 99 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 75249df2..d3afc58f 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -83,66 +83,57 @@ static const std::array Int4 per-block. 
-// * Generates signed (used for reference matmul) and unsigned (used to test micro-kernel matmul) packed matrices +// * Generates signed values for reference matmul // * Generates reference scales from input RHS matrix -static inline std::tuple ref_pack_rhs_qsi4c32p( - size_t N, size_t K, size_t bl, const Buffer* ref_rhs, RhsPackType pack_type) { - auto [ref_rhs_qsi4, ref_scales] = - quantize_symmetric_per_block_dynamic((*ref_rhs).data(), N, K, bl); - if (pack_type == RhsPackType::KxN) { - // Non-Transposed(kxn) RHS dimensions - const size_t ref_rhs_qsi4_kxn_stride = round_up_multiple(N, 2); - const size_t ref_rhs_qsi4_kxn_size = K * ref_rhs_qsi4_kxn_stride; - const size_t ref_rhs_qsi4_kxn_size_bytes = round_up_division(ref_rhs_qsi4_kxn_size, 2); - - auto ref_rhs_qsi4_kxn = transpose_with_padding( - ref_rhs_qsi4.data(), N, K, K, ref_rhs_qsi4_kxn_stride, ref_rhs_qsi4_kxn_size_bytes); +template +static inline std::tuple ref_quant_pack_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer& ref_rhs, RhsPackType pack_type) { + auto [rhs_values_qsi4, rhs_scales] = + quantize_symmetric_per_block_dynamic(ref_rhs.data(), N, K, bl); - const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4_kxn.data(), ref_rhs_qsi4_kxn_size); - auto ref_rhs_qsu4_padded = pad_row( - ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); + size_t width = pack_type == RhsPackType::KxN ? N : K; + size_t height = pack_type == RhsPackType::KxN ? K : N; - return {std::move(ref_rhs_qsi4_kxn), std::move(ref_rhs_qsu4_padded), std::move(ref_scales)}; - } else { - const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), N * K); - auto ref_rhs_qsu4_padded = pad_row( - ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); + size_t qsi4_stride = round_up_multiple(width, 2); + size_t qsi4_stride_bytes = round_up_division(width, 2); + size_t qsi4_size_bytes = round_up_division(height * qsi4_stride, 2); + size_t scales_stride_bytes = round_up_division(K, bl) * sizeof(ScaleType); - return {std::move(ref_rhs_qsi4), std::move(ref_rhs_qsu4_padded), std::move(ref_scales)}; + if (pack_type == RhsPackType::KxN) { + rhs_values_qsi4 = transpose_with_padding(rhs_values_qsi4.data(), N, K, K, qsi4_stride, qsi4_size_bytes); } + + return {std::move(rhs_values_qsi4), std::move(rhs_scales), qsi4_stride_bytes, qsi4_size_bytes, scales_stride_bytes}; } // Executes the scalar RHS packing micro-kernel. -static inline std::tuple imp_pack_rhs_qsi4c32p( - size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, - const Buffer* ref_biases, const Buffer* ref_scales, RhsPackType pack_type, size_t start_row, size_t rect_width) { - const size_t ref_rhs_qsu4_stride = - pack_type == RhsPackType::NxK ? 
round_up_division(K, 2) : round_up_division(N, 2); - const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); - - size_t rhs_offset, rhs_packed_offset; - if (pack_type == RhsPackType::KxN) { - rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(start_row, ref_rhs_qsu4_stride); - rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(start_row, K, nr, kr, sr, bl, scale_dt); - } else { - rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(start_row, ref_rhs_qsu4_stride); - rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(start_row, K, nr, kr, sr, bl, scale_dt); - } - - size_t bias_offset = start_row * sizeof(float); - size_t scale_offset = start_row * ref_rhs_scales_stride; +static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16( + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, + size_t rhs_stride_bytes, size_t rhs_size_bytes, const Buffer& ref_biases, size_t bias_offset, + const Buffer& ref_scales, size_t scales_stride_bytes, RhsPackType pack_type, size_t rect_start_row, + size_t rect_width) { + size_t width = pack_type == RhsPackType::KxN ? N : K; + size_t height = pack_type == RhsPackType::KxN ? K : N; + + auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_size_bytes * sr); + auto rhs_qsu4 = pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * sr, rhs_size_bytes); + kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - size_t imp_packed_rhs_size; + size_t scale_offset = rect_start_row * scales_stride_bytes; + size_t rhs_offset, rhs_packed_offset, imp_packed_rhs_size; if (pack_type == RhsPackType::KxN) { + rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rect_start_row, rhs_stride_bytes); + rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rect_start_row, K, nr, kr, sr, bl, scale_dt); imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); } else { + rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rect_start_row, rhs_stride_bytes); + rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rect_start_row, K, nr, kr, sr, bl, scale_dt); imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); } Buffer imp_packed_rhs(imp_packed_rhs_size); - if (pack_type == RhsPackType::KxN) { kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; @@ -150,9 +141,9 @@ static inline std::tuple imp_pack_rhs_qsi4c32p( params.scale_dt = scale_dt; kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( - 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset), - ref_rhs_qsu4_stride, reinterpret_cast((*ref_biases).data() + bias_offset), - (*ref_scales).data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, + 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast(rhs_qsu4.data() + rhs_offset), + rhs_stride_bytes, reinterpret_cast(ref_biases.data() + bias_offset), + ref_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); } else { kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{}; @@ -161,62 +152,68 @@ static inline std::tuple imp_pack_rhs_qsi4c32p( params.scale_dt = scale_dt; kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( - 1, rect_width, K, nr, kr, sr, bl, 
reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset), - ref_rhs_qsu4_stride, reinterpret_cast((*ref_biases).data() + bias_offset), - (*ref_scales).data() + scale_offset, ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, + 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast(rhs_qsu4.data() + rhs_offset), + rhs_stride_bytes, reinterpret_cast(ref_biases.data() + bias_offset), + ref_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); } return {std::move(imp_packed_rhs), rhs_packed_offset}; } // Executes the vectorized RHS packing micro-kernels for block length of 4 bytes or 8 bytes -static inline std::tuple imp_pack_rhs_qsi4c32p_neon( - size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, kai_datatype scale_dt, const Buffer* ref_rhs_qsu4, - const Buffer* ref_biases, const Buffer* ref_scales, size_t start_row, size_t rect_width) { +static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16_neon( + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, + size_t rhs_stride_bytes, size_t rhs_size_bytes, const Buffer& ref_biases, size_t bias_offset, + const Buffer& ref_scales, size_t scales_stride_bytes, RhsPackType pack_type, size_t rect_start_row, + size_t rect_width) { KAI_ASSUME(kr / sr == 8 || kr / sr == 4); + size_t width = pack_type == RhsPackType::KxN ? N : K; + size_t height = pack_type == RhsPackType::KxN ? K : N; - const size_t ref_rhs_qsu4_stride = round_up_division(K, 2); - const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); - size_t bias_offset = start_row * sizeof(float); - size_t scale_offset = start_row * ref_rhs_scales_stride; + auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_size_bytes * sr); + auto rhs_qsu4 = pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * sr, rhs_size_bytes); + kai_datatype scale_dt = kai_datatype::kai_dt_bf16; + + size_t scale_offset = rect_start_row * scales_stride_bytes; size_t imp_packed_rhs_size_neon, rhs_packed_offset_neon, rhs_offset_neon; if (kr / sr == 8) { imp_packed_rhs_size_neon = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon( - start_row, K, nr, kr, sr, bl, scale_dt); - rhs_offset_neon = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(start_row, ref_rhs_qsu4_stride); + rect_start_row, K, nr, kr, sr, bl, scale_dt); + rhs_offset_neon = + kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(rect_start_row, rhs_stride_bytes); } else { imp_packed_rhs_size_neon = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt); rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon( - start_row, K, nr, kr, sr, bl, scale_dt); - rhs_offset_neon = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(start_row, ref_rhs_qsu4_stride); + rect_start_row, K, nr, kr, sr, bl, scale_dt); + rhs_offset_neon = + kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(rect_start_row, rhs_stride_bytes); } - Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon); kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; params.rhs_zero_point = 8; params.scale_dt = scale_dt; + Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon); if (kr / sr == 8) { 
         kai_run_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(
             1, rect_width /* n */, K, nr, kr, sr, bl,
-            reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset_neon), ref_rhs_qsu4_stride,
-            reinterpret_cast((*ref_biases).data() + bias_offset),
-            reinterpret_cast((*ref_scales).data() + scale_offset), ref_rhs_scales_stride,
+            reinterpret_cast(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
+            reinterpret_cast(ref_biases.data() + bias_offset),
+            reinterpret_cast(ref_scales.data() + scale_offset), scales_stride_bytes,
             imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
     } else {
         kai_run_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(
             1, rect_width /* n */, K, nr, kr, sr, bl,
-            reinterpret_cast((*ref_rhs_qsu4).data() + rhs_offset_neon), ref_rhs_qsu4_stride,
-            reinterpret_cast((*ref_biases).data() + bias_offset),
-            reinterpret_cast((*ref_scales).data() + scale_offset), ref_rhs_scales_stride,
+            reinterpret_cast(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
+            reinterpret_cast(ref_biases.data() + bias_offset),
+            reinterpret_cast(ref_scales.data() + scale_offset), scales_stride_bytes,
             imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
     }
-
     return {std::move(imp_packed_rhs_neon), rhs_packed_offset_neon};
 }
 
@@ -226,23 +223,23 @@ class MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p
     : public ::testing::TestWithParam {};
 
 TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
-    const auto& [variant_index, matmul_shape, bl, portion, rhs_pack_type] = GetParam();
-    const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
+    auto& [variant_index, matmul_shape, bl, portion, rhs_pack_type] = GetParam();
+    auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
 
     if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
-        GTEST_SKIP() << "CPU features are not supported by current CPU";
+        GTEST_SKIP() << "Unsupported CPU feature";
     }
 
-    const std::uint32_t seed = 0;
+    const uint32_t seed = 0;
 
-    const size_t M = matmul_shape.m;
-    const size_t N = matmul_shape.n;
-    const size_t K = matmul_shape.k;
+    size_t M = matmul_shape.m;
+    size_t N = matmul_shape.n;
+    size_t K = matmul_shape.k;
 
-    const auto mr = ukernel_variant.interface.get_mr();
-    const auto nr = ukernel_variant.interface.get_nr();
-    const auto kr = ukernel_variant.interface.get_kr();
-    const auto sr = ukernel_variant.interface.get_sr();
+    auto mr = ukernel_variant.interface.get_mr();
+    auto nr = ukernel_variant.interface.get_nr();
+    auto kr = ukernel_variant.interface.get_kr();
+    auto sr = ukernel_variant.interface.get_sr();
 
     auto m_step = ukernel_variant.interface.get_m_step();
     ASSERT_TRUE(m_step % mr == 0);
@@ -250,7 +247,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
-    const auto rect = portion.compute_portion(M, N, m_step, n_step);
+    auto rect = portion.compute_portion(M, N, m_step, n_step);
     if (rect.height() == 0 || rect.width() == 0) {
         GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
     }
@@ -259,57 +256,58 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
     const auto ref_lhs = fill_random(M * K, seed + 0);
     const auto ref_rhs = fill_random(N * K, seed + 1);
     const auto ref_biases = fill_random(N, seed + 2);
-    kai_datatype scale_dt = kai_datatype::kai_dt_bf16;
 
     // Runs the reference implementation.
     // * Quantizes the LHS matrix using 8-bit asymmetric quantization.
     // * Quantizes the RHS matrix using 4-bit symmetric quantization.
     // * Performs GEMM.
-    const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
+    auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
         quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K);
-    const auto [ref_rhs_qsi4, ref_rhs_qsu4, ref_rhs_scales] = ref_pack_rhs_qsi4c32p(N, K, bl, &ref_rhs, rhs_pack_type);
+    auto [ref_rhs_values_qsi4, ref_rhs_scales, rhs_stride_bytes, rhs_size_bytes, rhs_scales_stride_bytes] =
+        ref_quant_pack_rhs_qsi4c32p(N, K, bl, ref_rhs, rhs_pack_type);
 
     Buffer ref_dst_noclamp;
-
     if (rhs_pack_type == RhsPackType::NxK) {
         ref_dst_noclamp =
             matmul_nt_t_quantized(
                 M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
-                ref_rhs_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr, 1);
+                ref_rhs_values_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr,
+                1);
     } else {
         ref_dst_noclamp =
             matmul_nt_nt_quantized(
                 M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
-                ref_rhs_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr, 1);
+                ref_rhs_values_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr,
+                1);
    }
 
     // Clamps the reference output.
-    const auto clamp_ratio = 0.8F;
-    const auto [clamp_min, clamp_max] = find_clamp_range(ref_dst_noclamp.data(), M * N, clamp_ratio);
-    const auto ref_dst = clamp(ref_dst_noclamp.data(), M * N, clamp_min, clamp_max);
+    auto clamp_ratio = 0.8F;
+    auto [clamp_min, clamp_max] = find_clamp_range(ref_dst_noclamp.data(), M * N, clamp_ratio);
+    auto ref_dst = clamp(ref_dst_noclamp.data(), M * N, clamp_min, clamp_max);
 
     // Runs the LHS packing micro-kernel.
     const auto lhs_start_row = rect.start_row();
     const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr);
-    Buffer imp_packed_lhs = Buffer(imp_packed_lhs_size);
+    Buffer imp_packed_lhs(imp_packed_lhs_size);
 
     auto lhs_stride = K * sizeof(float);
 
     auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride);
     auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr);
     auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K);
-
     ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset);
 
     kai_run_lhs_quant_pack_qai8dxp_f32(
         rect.height() /* m */, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data() + lhs_offset),
         lhs_stride, reinterpret_cast(imp_packed_lhs.data()) + lhs_packed_offset);
 
-    const size_t rhs_start_row = rect.start_col();
+    const auto rhs_start_row = rect.start_col();
+    size_t bias_offset = rhs_start_row * sizeof(float);
 
-    auto [imp_packed_rhs, rhs_packed_offset] = imp_pack_rhs_qsi4c32p(
-        N, K, bl, nr, kr, sr, scale_dt, &ref_rhs_qsu4, &ref_biases, &ref_rhs_scales, rhs_pack_type, rhs_start_row,
-        rect.width());
+    auto [imp_packed_rhs, rhs_packed_offset] = imp_pack_rhs_qsi4c32pscalebf16(
+        N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, rhs_stride_bytes, rhs_size_bytes, ref_biases, bias_offset,
+        ref_rhs_scales, rhs_scales_stride_bytes, rhs_pack_type, rhs_start_row, rect.width());
 
     auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl);
     ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset);
@@ -324,7 +322,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
     // Runs the GEMM micro-kernel.
const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); - Buffer imp_dst = Buffer(imp_dst_size); + Buffer imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( rect.height(), rect.width(), K, bl, reinterpret_cast(imp_packed_lhs.data()) + lhs_matmul_offset, @@ -341,8 +339,9 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { // Test vectorized packing functions, if packing parameters allow if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) { - const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32p_neon( - N, K, bl, nr, kr, sr, scale_dt, &ref_rhs_qsu4, &ref_biases, &ref_rhs_scales, rhs_start_row, rect.width()); + const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32pscalebf16_neon( + N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, rhs_stride_bytes, rhs_size_bytes, ref_biases, bias_offset, + ref_rhs_scales, rhs_scales_stride_bytes, rhs_pack_type, rhs_start_row, rect.width()); ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset); ukernel_variant.interface.run_matmul( -- GitLab From f71c66ac364197b40f3a2772d04a6a5419ebdd04 Mon Sep 17 00:00:00 2001 From: Evie Wright Date: Fri, 18 Jul 2025 16:10:37 +0100 Subject: [PATCH 4/6] address final comments, move reference function to another file Signed-off-by: Evie Wright --- test/reference/quantize.cpp | 25 +++++ test/reference/quantize.hpp | 4 + ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 92 ++++++++----------- 3 files changed, 69 insertions(+), 52 deletions(-) diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index 008f6676..ed59a2e5 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -20,6 +20,7 @@ #include "test/common/round.hpp" #include "test/common/type_traits.hpp" #include "test/reference/cast.hpp" +#include "test/reference/transpose.hpp" namespace kai::test { @@ -293,4 +294,28 @@ template std::tuple quantize_asymmetric_per_block_dynami const void* src, size_t height, size_t width, size_t quant_width); template std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); + +// Reference quantization and packing => Int4 per-block. +// * Generates signed values for reference matmul +// * Generates reference scales from input RHS matrix +template +inline std::tuple ref_quant_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed) { + auto [rhs_values_qsi4, rhs_scales] = + quantize_symmetric_per_block_dynamic(ref_rhs.data(), N, K, bl); + + const size_t width = transposed ? K : N; + const size_t height = transposed ? N : K; + + const size_t qsi4_stride = round_up_multiple(width, 2); + const size_t qsi4_size_bytes = round_up_division(height * qsi4_stride, 2); + + if (!transposed) { + rhs_values_qsi4 = transpose_with_padding(rhs_values_qsi4.data(), N, K, K, qsi4_stride, qsi4_size_bytes); + } + + return {std::move(rhs_values_qsi4), std::move(rhs_scales)}; +} +template std::tuple ref_quant_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed); } // namespace kai::test diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index d0aa3bcd..6e0ecb03 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -358,4 +358,8 @@ template quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); +/// Reference quantization template for rhs packing, Int4 per-block. 
+template +std::tuple ref_quant_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed); } // namespace kai::test diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index d3afc58f..64e648e2 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -82,44 +82,26 @@ static const std::array Int4 per-block. -// * Generates signed values for reference matmul -// * Generates reference scales from input RHS matrix -template -static inline std::tuple ref_quant_pack_rhs_qsi4c32p( - size_t N, size_t K, size_t bl, const Buffer& ref_rhs, RhsPackType pack_type) { - auto [rhs_values_qsi4, rhs_scales] = - quantize_symmetric_per_block_dynamic(ref_rhs.data(), N, K, bl); - - size_t width = pack_type == RhsPackType::KxN ? N : K; - size_t height = pack_type == RhsPackType::KxN ? K : N; - - size_t qsi4_stride = round_up_multiple(width, 2); - size_t qsi4_stride_bytes = round_up_division(width, 2); - size_t qsi4_size_bytes = round_up_division(height * qsi4_stride, 2); - size_t scales_stride_bytes = round_up_division(K, bl) * sizeof(ScaleType); - - if (pack_type == RhsPackType::KxN) { - rhs_values_qsi4 = transpose_with_padding(rhs_values_qsi4.data(), N, K, K, qsi4_stride, qsi4_size_bytes); - } - - return {std::move(rhs_values_qsi4), std::move(rhs_scales), qsi4_stride_bytes, qsi4_size_bytes, scales_stride_bytes}; -} - // Executes the scalar RHS packing micro-kernel. -static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16( +static inline std::tuple pack_rhs_qsi4c32pscalebf16( size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, - size_t rhs_stride_bytes, size_t rhs_size_bytes, const Buffer& ref_biases, size_t bias_offset, - const Buffer& ref_scales, size_t scales_stride_bytes, RhsPackType pack_type, size_t rect_start_row, - size_t rect_width) { - size_t width = pack_type == RhsPackType::KxN ? N : K; - size_t height = pack_type == RhsPackType::KxN ? K : N; - - auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_size_bytes * sr); - auto rhs_qsu4 = pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * sr, rhs_size_bytes); + const Buffer& ref_biases, size_t bias_offset, const Buffer& ref_scales, RhsPackType pack_type, + size_t rect_start_row, size_t rect_width) { + const size_t width = pack_type == RhsPackType::KxN ? N : K; + const size_t height = pack_type == RhsPackType::KxN ? 
K : N; kai_datatype scale_dt = kai_datatype::kai_dt_bf16; - size_t scale_offset = rect_start_row * scales_stride_bytes; + const size_t rhs_stride = round_up_multiple(width, 2); + const size_t rhs_stride_bytes = round_up_division(width, 2); + const size_t scales_stride_bytes = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); + + KAI_ASSUME(rhs_values_qsi4.size() == round_up_division(height * rhs_stride, 2)); + + const auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_values_qsi4.size() * 2); + auto rhs_qsu4 = + pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * 2, rhs_values_qsi4.size()); + + const size_t scale_offset = rect_start_row * scales_stride_bytes; size_t rhs_offset, rhs_packed_offset, imp_packed_rhs_size; if (pack_type == RhsPackType::KxN) { rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rect_start_row, rhs_stride_bytes); @@ -163,17 +145,23 @@ static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16( // Executes the vectorized RHS packing micro-kernels for block length of 4 bytes or 8 bytes static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16_neon( size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, - size_t rhs_stride_bytes, size_t rhs_size_bytes, const Buffer& ref_biases, size_t bias_offset, - const Buffer& ref_scales, size_t scales_stride_bytes, RhsPackType pack_type, size_t rect_start_row, - size_t rect_width) { + const Buffer& ref_biases, size_t bias_offset, const Buffer& ref_scales, RhsPackType pack_type, + size_t rect_start_row, size_t rect_width) { KAI_ASSUME(kr / sr == 8 || kr / sr == 4); - size_t width = pack_type == RhsPackType::KxN ? N : K; - size_t height = pack_type == RhsPackType::KxN ? K : N; - - auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_size_bytes * sr); - auto rhs_qsu4 = pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * sr, rhs_size_bytes); + const size_t width = pack_type == RhsPackType::KxN ? N : K; + const size_t height = pack_type == RhsPackType::KxN ? K : N; kai_datatype scale_dt = kai_datatype::kai_dt_bf16; + const size_t rhs_stride = round_up_multiple(width, 2); + const size_t rhs_stride_bytes = round_up_division(width, 2); + const size_t scales_stride_bytes = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt); + + KAI_ASSUME(rhs_values_qsi4.size() == round_up_division(height * rhs_stride, 2)); + + const auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_values_qsi4.size() * 2); + auto rhs_qsu4 = + pad_row(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * 2, rhs_values_qsi4.size()); + size_t scale_offset = rect_start_row * scales_stride_bytes; size_t imp_packed_rhs_size_neon, rhs_packed_offset_neon, rhs_offset_neon; @@ -263,8 +251,8 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { // * Performs GEMM. auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] = quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K); - auto [ref_rhs_values_qsi4, ref_rhs_scales, rhs_stride_bytes, rhs_size_bytes, rhs_scales_stride_bytes] = - ref_quant_pack_rhs_qsi4c32p(N, K, bl, ref_rhs, rhs_pack_type); + auto [ref_rhs_values_qsi4, ref_rhs_scales] = + ref_quant_rhs_qsi4c32p(N, K, bl, ref_rhs, rhs_pack_type == RhsPackType::NxK); Buffer ref_dst_noclamp; if (rhs_pack_type == RhsPackType::NxK) { @@ -282,8 +270,8 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { } // Clamps the reference output. 
- auto clamp_ratio = 0.8F; - auto [clamp_min, clamp_max] = find_clamp_range(ref_dst_noclamp.data(), M * N, clamp_ratio); + const auto clamp_ratio = 0.8F; + const auto [clamp_min, clamp_max] = find_clamp_range(ref_dst_noclamp.data(), M * N, clamp_ratio); auto ref_dst = clamp(ref_dst_noclamp.data(), M * N, clamp_min, clamp_max); // Runs the LHS packing micro-kernel. @@ -291,7 +279,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); Buffer imp_packed_lhs(imp_packed_lhs_size); - auto lhs_stride = K * sizeof(float); + const auto lhs_stride = K * sizeof(float); auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); @@ -305,9 +293,9 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { const auto rhs_start_row = rect.start_col(); size_t bias_offset = rhs_start_row * sizeof(float); - auto [imp_packed_rhs, rhs_packed_offset] = imp_pack_rhs_qsi4c32pscalebf16( - N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, rhs_stride_bytes, rhs_size_bytes, ref_biases, bias_offset, - ref_rhs_scales, rhs_scales_stride_bytes, rhs_pack_type, rhs_start_row, rect.width()); + auto [imp_packed_rhs, rhs_packed_offset] = pack_rhs_qsi4c32pscalebf16( + N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, ref_biases, bias_offset, ref_rhs_scales, rhs_pack_type, + rhs_start_row, rect.width()); auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl); ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset); @@ -340,8 +328,8 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) { // Test vectorized packing functions, if packing parameters allow if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) { const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32pscalebf16_neon( - N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, rhs_stride_bytes, rhs_size_bytes, ref_biases, bias_offset, - ref_rhs_scales, rhs_scales_stride_bytes, rhs_pack_type, rhs_start_row, rect.width()); + N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, ref_biases, bias_offset, ref_rhs_scales, rhs_pack_type, + rhs_start_row, rect.width()); ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset); ukernel_variant.interface.run_matmul( -- GitLab From 37398b13616598c91777e5fd8d2c9c0f7479ac38 Mon Sep 17 00:00:00 2001 From: Evie Wright Date: Fri, 18 Jul 2025 16:57:21 +0100 Subject: [PATCH 5/6] remove ref_ and imp_ prefixes from names where not required, add documentation to new reference function Signed-off-by: Evie Wright --- test/reference/quantize.cpp | 8 ++--- test/reference/quantize.hpp | 19 +++++++++-- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 33 +++++++++---------- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp index ed59a2e5..301fef9b 100644 --- a/test/reference/quantize.cpp +++ b/test/reference/quantize.cpp @@ -299,10 +299,10 @@ template std::tuple quantize_asymmetric_per_block_dynami // * Generates signed values for reference matmul // * Generates reference scales from input RHS matrix template -inline std::tuple ref_quant_rhs_qsi4c32p( - size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed) { +inline std::tuple quantize_rhs_qsi4c32p( + size_t N, size_t K, size_t bl, const Buffer& rhs, bool transposed) { auto [rhs_values_qsi4, 
rhs_scales] = - quantize_symmetric_per_block_dynamic(ref_rhs.data(), N, K, bl); + quantize_symmetric_per_block_dynamic(rhs.data(), N, K, bl); const size_t width = transposed ? K : N; const size_t height = transposed ? N : K; @@ -316,6 +316,6 @@ inline std::tuple ref_quant_rhs_qsi4c32p( return {std::move(rhs_values_qsi4), std::move(rhs_scales)}; } -template std::tuple ref_quant_rhs_qsi4c32p( +template std::tuple quantize_rhs_qsi4c32p( size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed); } // namespace kai::test diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp index 6e0ecb03..1dc70833 100644 --- a/test/reference/quantize.hpp +++ b/test/reference/quantize.hpp @@ -359,7 +359,22 @@ std::tuple quantize_asymmetric_per_block_dynamic( const void* src, size_t height, size_t width, size_t quant_width); /// Reference quantization template for rhs packing, Int4 per-block. +/// +/// Quantized values and scales are calculated using the +/// @ref quantize_symmetric_per_block_dynamic function. +/// If the quantized matrix needs to have KxN orientation the value matrix is then transposed using the +/// @ref transpose_with_padding function. +/// +/// @tparam SrcType The data type of the input data (must be floating-point). +/// @tparam ScaleType The data type of the quantization scales (must be floating-point). +/// +/// @param[in] N The number of input rows. +/// @param[in] K The number of input columns. +/// @param[in] transposed Whether the quantized matrix should be transposed (NxK orientation) or non-transposed (KxN +/// orientation) for an output matrix of MxN. +/// @param[in] rhs The matrix to be quantized +/// +/// @return The quantized data matrix and the quantization scale matrix. template -std::tuple ref_quant_rhs_qsi4c32p( - size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed); +std::tuple quantize_rhs_qsi4c32p(size_t N, size_t K, size_t bl, const Buffer& rhs, bool transposed); } // namespace kai::test diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 64e648e2..f753b0b8 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -84,9 +84,8 @@ static const std::array pack_rhs_qsi4c32pscalebf16( - size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, - const Buffer& ref_biases, size_t bias_offset, const Buffer& ref_scales, RhsPackType pack_type, - size_t rect_start_row, size_t rect_width) { + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, const Buffer& biases, + size_t bias_offset, const Buffer& rhs_scales, RhsPackType pack_type, size_t rect_start_row, size_t rect_width) { const size_t width = pack_type == RhsPackType::KxN ? N : K; const size_t height = pack_type == RhsPackType::KxN ? 
K : N; kai_datatype scale_dt = kai_datatype::kai_dt_bf16; @@ -124,9 +123,8 @@ static inline std::tuple pack_rhs_qsi4c32pscalebf16( kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0( 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast(rhs_qsu4.data() + rhs_offset), - rhs_stride_bytes, reinterpret_cast(ref_biases.data() + bias_offset), - ref_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, - ¶ms); + rhs_stride_bytes, reinterpret_cast(biases.data() + bias_offset), rhs_scales.data() + scale_offset, + scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); } else { kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{}; params.lhs_zero_point = 1; @@ -135,18 +133,17 @@ static inline std::tuple pack_rhs_qsi4c32pscalebf16( kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( 1, rect_width, K, nr, kr, sr, bl, reinterpret_cast(rhs_qsu4.data() + rhs_offset), - rhs_stride_bytes, reinterpret_cast(ref_biases.data() + bias_offset), - ref_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, + rhs_stride_bytes, reinterpret_cast(biases.data() + bias_offset), + rhs_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0, ¶ms); } return {std::move(imp_packed_rhs), rhs_packed_offset}; } // Executes the vectorized RHS packing micro-kernels for block length of 4 bytes or 8 bytes -static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16_neon( - size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, - const Buffer& ref_biases, size_t bias_offset, const Buffer& ref_scales, RhsPackType pack_type, - size_t rect_start_row, size_t rect_width) { +static inline std::tuple pack_rhs_qsi4c32pscalebf16_neon( + size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4, const Buffer& biases, + size_t bias_offset, const Buffer& rhs_scales, RhsPackType pack_type, size_t rect_start_row, size_t rect_width) { KAI_ASSUME(kr / sr == 8 || kr / sr == 4); const size_t width = pack_type == RhsPackType::KxN ? N : K; const size_t height = pack_type == RhsPackType::KxN ? 
K : N;
@@ -191,15 +188,15 @@ static inline std::tuple imp_pack_rhs_qsi4c32pscalebf16_neon(
             kai_run_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(
                 1, rect_width /* n */, K, nr, kr, sr, bl,
                 reinterpret_cast(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
-                reinterpret_cast(ref_biases.data() + bias_offset),
-                reinterpret_cast(ref_scales.data() + scale_offset), scales_stride_bytes,
+                reinterpret_cast(biases.data() + bias_offset),
+                reinterpret_cast(rhs_scales.data() + scale_offset), scales_stride_bytes,
                 imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
         } else {
             kai_run_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(
                 1, rect_width /* n */, K, nr, kr, sr, bl,
                 reinterpret_cast(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
-                reinterpret_cast(ref_biases.data() + bias_offset),
-                reinterpret_cast(ref_scales.data() + scale_offset), scales_stride_bytes,
+                reinterpret_cast(biases.data() + bias_offset),
+                reinterpret_cast(rhs_scales.data() + scale_offset), scales_stride_bytes,
                 imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
     }
     return {std::move(imp_packed_rhs_neon), rhs_packed_offset_neon};
@@ -252,7 +249,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
     auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
         quantize_asymmetric_per_block_dynamic(ref_lhs.data(), M, K, K);
     auto [ref_rhs_values_qsi4, ref_rhs_scales] =
-        ref_quant_rhs_qsi4c32p(N, K, bl, ref_rhs, rhs_pack_type == RhsPackType::NxK);
+        quantize_rhs_qsi4c32p(N, K, bl, ref_rhs, rhs_pack_type == RhsPackType::NxK);

     Buffer ref_dst_noclamp;
     if (rhs_pack_type == RhsPackType::NxK) {
@@ -327,7 +324,7 @@ TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {

     // Test vectorized packing functions, if packing parameters allow
     if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) {
-        const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = imp_pack_rhs_qsi4c32pscalebf16_neon(
+        const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = pack_rhs_qsi4c32pscalebf16_neon(
             N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, ref_biases, bias_offset, ref_rhs_scales, rhs_pack_type,
             rhs_start_row, rect.width());
         ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset);
-- GitLab

From 48e7ac497d07dc6d5d5a1065b8fbbe741bfd31f0 Mon Sep 17 00:00:00 2001
From: Evie Wright
Date: Mon, 21 Jul 2025 09:36:34 +0100
Subject: [PATCH 6/6] correct doxygen comment in quantize.hpp

Signed-off-by: Evie Wright
---
 test/reference/quantize.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp
index 1dc70833..cab79ffe 100644
--- a/test/reference/quantize.hpp
+++ b/test/reference/quantize.hpp
@@ -370,9 +370,10 @@ std::tuple quantize_asymmetric_per_block_dynamic(
     const void* src, size_t height, size_t width, size_t quant_width);
 ///
 /// @param[in] N The number of input rows.
 /// @param[in] K The number of input columns.
+/// @param[in] bl The block length for quantization.
+/// @param[in] rhs The matrix to be quantized.
 /// @param[in] transposed Whether the quantized matrix should be transposed (NxK orientation) or non-transposed (KxN
 /// orientation) for an output matrix of MxN.
-/// @param[in] rhs The matrix to be quantized
 ///
 /// @return The quantized data matrix and the quantization scale matrix.
 template
-- GitLab
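
For readers who want the arithmetic behind quantize_rhs_qsi4c32p and the reference matmul without the test-suite scaffolding, the following is a minimal, self-contained C++ sketch of the general scheme: symmetric per-block quantization of an NxK RHS (one scale per bl consecutive elements along K) followed by a dequantize-on-accumulate matmul. The names, the amax/7 scale choice, the [-8, 7] int4 range, and the unquantized fp32 LHS are illustrative assumptions local to this sketch, not the test suite's API; the LHS int8 quantization and output clamping steps used by the tests are omitted for brevity.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Per-block symmetric int4-style quantization of an NxK RHS matrix:
// one scale per block of `bl` consecutive elements along K, values in [-8, 7].
static void quantize_rhs_int4_per_block(
    const std::vector<float>& rhs, size_t N, size_t K, size_t bl,
    std::vector<int8_t>& rhs_q, std::vector<float>& rhs_scales) {
    const size_t blocks_per_row = (K + bl - 1) / bl;
    rhs_q.assign(N * K, 0);
    rhs_scales.assign(N * blocks_per_row, 0.0F);

    for (size_t n = 0; n < N; ++n) {
        for (size_t b = 0; b < blocks_per_row; ++b) {
            const size_t k_start = b * bl;
            const size_t k_end = std::min(k_start + bl, K);

            // Scale chosen so the largest magnitude in the block maps onto the int4 range.
            float amax = 0.0F;
            for (size_t k = k_start; k < k_end; ++k) {
                amax = std::max(amax, std::fabs(rhs[n * K + k]));
            }
            const float scale = amax / 7.0F;
            rhs_scales[n * blocks_per_row + b] = scale;

            for (size_t k = k_start; k < k_end; ++k) {
                const float q = scale != 0.0F ? rhs[n * K + k] / scale : 0.0F;
                rhs_q[n * K + k] = static_cast<int8_t>(std::clamp(std::lround(q), -8L, 7L));
            }
        }
    }
}

// Reference matmul: fp32 LHS (MxK) against the quantized RHS (NxK, accessed transposed),
// dequantizing each RHS element with its block scale while accumulating.
static std::vector<float> matmul_ref(
    const std::vector<float>& lhs, const std::vector<int8_t>& rhs_q,
    const std::vector<float>& rhs_scales, size_t M, size_t N, size_t K, size_t bl) {
    const size_t blocks_per_row = (K + bl - 1) / bl;
    std::vector<float> dst(M * N, 0.0F);
    for (size_t m = 0; m < M; ++m) {
        for (size_t n = 0; n < N; ++n) {
            float acc = 0.0F;
            for (size_t k = 0; k < K; ++k) {
                const float scale = rhs_scales[n * blocks_per_row + k / bl];
                acc += lhs[m * K + k] * static_cast<float>(rhs_q[n * K + k]) * scale;
            }
            dst[m * N + n] = acc;
        }
    }
    return dst;
}

int main() {
    const size_t M = 2, N = 3, K = 8, bl = 4;
    std::vector<float> lhs(M * K), rhs(N * K);
    for (size_t i = 0; i < lhs.size(); ++i) lhs[i] = 0.1F * static_cast<float>(i % 7);
    for (size_t i = 0; i < rhs.size(); ++i) rhs[i] = 0.05F * static_cast<float>(i % 11) - 0.25F;

    std::vector<int8_t> rhs_q;
    std::vector<float> rhs_scales;
    quantize_rhs_int4_per_block(rhs, N, K, bl, rhs_q, rhs_scales);
    const auto dst = matmul_ref(lhs, rhs_q, rhs_scales, M, N, K, bl);

    std::printf("dst[0] = %f\n", dst[0]);
    return 0;
}

Mapping the block's largest magnitude onto 7 keeps zero exactly representable, which is why a symmetric per-block scheme like this needs scales but no RHS zero points in the reference matmul.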