From 77ae9e28ccfc7e9565e8e7f65388e09656377df6 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 00:16:07 +0000 Subject: [PATCH 1/9] Improve Int4 Matmul Micro-kernels test coverage by adding portion testing Signed-off-by: Anitha Raj --- ...matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 127 ++++++++++++++---- 1 file changed, 104 insertions(+), 23 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 6519d028..5b723aa2 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -34,6 +34,7 @@ #include "test/common/bfloat16.hpp" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" +#include "test/common/matrix_portion.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" #include "test/common/test_suite.hpp" @@ -66,7 +67,7 @@ static const std::array; +using MatMulTestParams_withBL = std::tuple; class UkernelVariantTest_withBL : public ::testing::TestWithParam {}; @@ -99,7 +100,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_LHS) { } TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { - const auto& [variant_index, matmul_shape, bl] = GetParam(); + const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { @@ -121,6 +122,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { const auto ref_lhs = fill_random(M * K, seed + 0); const auto ref_rhs = fill_random(N * K, seed + 1); const auto ref_biases = fill_random(N, seed + 2); + kai_datatype scale_dt = kai_datatype::kai_dt_bf16; // Runs the reference implementation. // * Quantizes the LHS matrix using 8-bit asymmetric quantization. 
@@ -136,11 +138,31 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); // Runs the RHS packing micro-kernel. // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. 
@@ -153,8 +175,13 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { const size_t ref_rhs_scales_stride = round_up_division(K, bl) * sizeof(uint16_t); const auto imp_packed_rhs_size = - kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, kai_datatype::kai_dt_bf16); + kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); std::vector imp_packed_rhs(imp_packed_rhs_size); + + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(packed_rhs_start_row, K, nr, kr, sr, bl, scale_dt); + constexpr kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{ .lhs_zero_point = 1, .rhs_zero_point = 8, .scale_dt = kai_datatype::kai_dt_bf16}; kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0( @@ -162,20 +189,30 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), ref_rhs_scales_stride, imp_packed_rhs.data(), 0, ¶ms); - // Runs the GEMM micro-kernel. + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); + const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); + const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); + + // Runs the GEMM micro-kernel. 
std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, bl, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_packed_offset, + imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + // Compares the output of the micro-kernels against the output of the reference implementation for the portion + // tested. + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? 
std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -185,7 +222,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) { } TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { - const auto& [variant_index, matmul_shape, bl] = GetParam(); + const auto& [variant_index, matmul_shape, bl, portion] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { @@ -207,6 +244,7 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const auto ref_lhs = fill_random(M * K, seed + 0); const auto ref_rhs_transposed = fill_random(N * K, seed + 1); const auto ref_biases = fill_random(N, seed + 2); + kai_datatype scale_dt = kai_datatype::kai_dt_bf16; // Transposed(nxk) RHS dimensions const size_t ref_rhs_qsi4_nxk_stride = K; @@ -234,11 +272,31 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); // Runs the RHS packing micro-kernel. // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. @@ -249,8 +307,12 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { const size_t ref_rhs_qsu4_stride = round_up_division(N, 2); const size_t ref_rhs_scales_stride = round_up_division(K, bl) * sizeof(uint16_t); + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(packed_rhs_start_row, K, nr, kr, sr, bl, scale_dt); + const auto imp_packed_rhs_size = - kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, kai_datatype::kai_dt_bf16); + kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt); std::vector imp_packed_rhs(imp_packed_rhs_size); constexpr kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{ .lhs_zero_point = 1, .rhs_zero_point = 8, .scale_dt = kai_datatype::kai_dt_bf16}; @@ -259,20 +321,28 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) { reinterpret_cast(ref_biases.data()), ref_rhs_scales.data(), ref_rhs_scales_stride, imp_packed_rhs.data(), 0, ¶ms); + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), 
rect.start_col(), dst_stride); + const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); + // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, bl, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_packed_offset, + imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -291,15 +361,26 @@ INSTANTIATE_TEST_SUITE_P( MatMulShape{17, 25, 64}, // MatMulShape{15, 31, 128}, // MatMulShape{1, 25, 64}), - testing::Values(32, 64)), + testing::Values(32, 64), + testing::Values( + MatrixPortion(0, 0, 1, 1), // Full matrix. + MatrixPortion(0, 0, 1, 0.25), // Leftmost portion. + MatrixPortion(0, 0.75, 1, 1), // Rightmost portion. 
+ MatrixPortion(0, 0.5, 1, 0.8) // Somewhere Middle + )), [](const auto& info) { const auto variant_idx = std::get<0>(info.param); const std::string name{variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_idx).name}; const auto shape = std::get(info.param); const auto bl = std::get<2>(info.param); + const auto portion = std::get<3>(info.param); std::stringstream sstream; - sstream << name << "__M_" << shape.m << "__N_" << shape.n << "__K_" << shape.k << "__BL_" << bl; + sstream << name << "__M_" << shape.m << "__N_" << shape.n << "__K_" << shape.k << "__BL_" << bl + << "__PortionStartRow_" << static_cast(portion.start_row() * 1000) // + << "__PortionStartCol_" << static_cast(portion.start_col() * 1000) // + << "__PortionHeight_" << static_cast(portion.height() * 1000) // + << "__PortionWidth_" << static_cast(portion.width() * 1000); return sstream.str(); }); -- GitLab From 1063f254607dceb8fea6753b548aed193f4739a3 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 08:43:45 +0000 Subject: [PATCH 2/9] Add portion tests to per-channel int4 Signed-off-by: Anitha Raj --- ...2_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c | 2 +- .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 175 +++++++++++++++--- 2 files changed, 147 insertions(+), 30 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c index 9b27c742..ae71f09c 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c @@ -47,7 +47,7 @@ inline static size_t kai_rhs_packed_stride(size_t k) { KAI_ASSERT((k_internal % 2) == 0); - return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs); 
+ return kai_nr * ((k_internal / 2) + kai_num_bytes_multiplier_rhs + kai_num_bytes_sum_rhs + kai_num_bytes_bias); } size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void) { diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 3a5d0b58..9811692f 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -35,6 +35,7 @@ #include "kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxps1s0_qsu4cxs1s0_neon.h" #include "test/common/cpu_info.hpp" #include "test/common/int4.hpp" +#include "test/common/matrix_portion.hpp" #include "test/common/memory.hpp" #include "test/common/round.hpp" #include "test/common/test_suite.hpp" @@ -46,7 +47,10 @@ #include "test/reference/transpose.hpp" namespace kai::test { +/// Matrix multiplication test information. +using MatMulTestParams_with_portion = std::tuple; +class UkernelVariantTest_with_portions : public ::testing::TestWithParam {}; enum class RhsPackType { NxK, KxN }; using ukernel_rhs_pack_function = std::function; @@ -169,10 +173,10 @@ static const std::array::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); // Runs the RHS packing micro-kernel. // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. @@ -228,6 +253,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { ref_rhs_qsi4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 0}; @@ -235,20 +264,29 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), 
dst_stride); + const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); + // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, + imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + // Compares the output of the micro-kernels against the output of the reference implementation for the portion + // tested. + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? 
std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -256,8 +294,9 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { } } } + TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { - const auto& [variant_index, matmul_shape] = GetParam(); + const auto& [variant_index, matmul_shape, portion] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4cxp.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { @@ -297,11 +336,32 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), N * K); // Runs the RHS packing micro-kernel. @@ -311,6 +371,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 8}; @@ -318,6 +382,10 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { 1, N, K, nr, kr, sr, ref_rhs_qsu4_padded.data(), reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); + const 
auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); @@ -418,15 +486,19 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, + imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); - // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + // Compares the output of the micro-kernels against the output of the reference implementation for the portion + // tested. + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? 
std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -436,7 +508,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { } TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { - const auto& [variant_index, matmul_shape] = GetParam(); + const auto& [variant_index, matmul_shape, portion] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4cxp.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { @@ -488,11 +560,32 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); // Runs the RHS packing micro-kernel. // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. @@ -501,26 +594,39 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { const auto ref_rhs_qsu4_padded = pad_row( ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); + std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 8}; ukernel_variant.run_rhs_pack( 1, N, K, nr, kr, sr, ref_rhs_qsu4_padded.data(), reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, ¶ms); + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); + const 
auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); + // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); std::vector imp_dst(imp_dst_size); ukernel_variant.interface.run_matmul( - M, N, K, imp_packed_lhs.data(), imp_packed_rhs.data(), reinterpret_cast(imp_dst.data()), + rect.height(), rect.width(), K, imp_packed_lhs.data() + lhs_packed_offset, + imp_packed_rhs.data() + rhs_packed_offset, reinterpret_cast(imp_dst.data() + dst_offset), N * sizeof(float), sizeof(float), std::numeric_limits::lowest(), std::numeric_limits::max()); // Compares the output of the micro-kernels against the output of the reference implementation. - for (size_t y = 0; y < M; ++y) { - for (size_t x = 0; x < N; ++x) { - const auto imp_value = read_array(imp_dst.data(), y * N + x); - const auto ref_value = read_array(ref_dst.data(), y * N + x); - const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : std::abs(imp_value); + for (size_t y = 0; y < rect.height(); ++y) { + for (size_t x = 0; x < rect.width(); ++x) { + const auto imp_value = + read_array(imp_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto ref_value = + read_array(ref_dst.data(), (rect.start_row() + y) * N + (x + rect.start_col())); + const auto rel_error = ref_value != 0 ? std::abs((imp_value - ref_value) / ref_value) : imp_value; if (rel_error > 0.0001F) { ASSERT_EQ(imp_value, ref_value); @@ -539,14 +645,25 @@ INSTANTIATE_TEST_SUITE_P( MatMulShape{15, 35, 65}, // MatMulShape{8, 32, 64}, // MatMulShape{15, 31, 45}, // - MatMulShape{1, 35, 65})), + MatMulShape{1, 35, 65}), + testing::Values( + MatrixPortion(0, 0, 1, 1), // Full matrix. + MatrixPortion(0, 0, 1, 0.25), // Leftmost portion. + MatrixPortion(0, 0.75, 1, 1), // Rightmost portion. 
+ MatrixPortion(0, 0.5, 1, 0.8) // Somewhere Middle + )), [](const auto& info) { const auto variant_idx = std::get<0>(info.param); const std::string name{variants_kai_matmul_clamp_f32_qai8dxp_qsi4cxp.at(variant_idx).name}; const auto shape = std::get(info.param); + const auto portion = std::get<2>(info.param); std::stringstream sstream; - sstream << name << "__M_" << shape.m << "__N_" << shape.n << "__K_" << shape.k; + sstream << name << "__M_" << shape.m << "__N_" << shape.n << "__K_" << shape.k << "__PortionStartRow_" + << static_cast(portion.start_row() * 1000) // + << "__PortionStartCol_" << static_cast(portion.start_col() * 1000) // + << "__PortionHeight_" << static_cast(portion.height() * 1000) // + << "__PortionWidth_" << static_cast(portion.width() * 1000); return sstream.str(); }); -- GitLab From 3954b09007dcfcbc2c47c7ca3367caf516e5cb20 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 10:39:28 +0000 Subject: [PATCH 3/9] Fix kxn kernel name Signed-off-by: Anitha Raj --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 9811692f..944d1f41 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -596,7 +596,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); -- GitLab From 
20dff3580c3aebd167c218aa40f5de0dca51cc45 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 15:20:04 +0000 Subject: [PATCH 4/9] Update per-channel unit tests after rebase Signed-off-by: Anitha Raj --- .../matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 87 +++++++++++++------ 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 944d1f41..b593c002 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -55,11 +55,13 @@ enum class RhsPackType { NxK, KxN }; using ukernel_rhs_pack_function = std::function; using ukernel_get_rhs_packed_size = std::function; +using ukernel_get_rhs_packed_offset = std::function; template struct UkernelVariantCustom : public UkernelVariant { ukernel_rhs_pack_function run_rhs_pack; ukernel_get_rhs_packed_size get_rhs_packed_size; + ukernel_get_rhs_packed_offset get_rhs_packed_offset; RhsPackType rhs_pack_type; bool signed_integer_support; @@ -68,10 +70,12 @@ struct UkernelVariantCustom : public UkernelVariant { UkernelVariantCustom( T interface, std::string_view name, const std::function& fn_is_supported, ukernel_rhs_pack_function run_rhs_pack, ukernel_get_rhs_packed_size get_rhs_packed_size, - const RhsPackType pack_type, const bool signed_integer_support) : + ukernel_get_rhs_packed_offset get_rhs_packed_offset, const RhsPackType pack_type, + const bool signed_integer_support) : UkernelVariant(interface, name, fn_is_supported), run_rhs_pack(std::move(run_rhs_pack)), get_rhs_packed_size(std::move(get_rhs_packed_size)), + get_rhs_packed_offset(std::move(get_rhs_packed_offset)), rhs_pack_type(pack_type), signed_integer_support(signed_integer_support) { } @@ -82,12 +86,14 @@ static const std::array imp_packed_rhs(imp_packed_rhs_size); @@ -372,8 +378,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { const auto 
imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); @@ -409,7 +414,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { } TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { - const auto& [variant_index, matmul_shape] = GetParam(); + const auto& [variant_index, matmul_shape, portion] = GetParam(); const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4cxp.at(variant_index); if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) { @@ -463,11 +468,31 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { ref_rhs_scales.data(), nullptr, K, ref_biases.data(), std::numeric_limits::lowest(), std::numeric_limits::max()); + auto m_step = ukernel_variant.interface.get_m_step(); + ASSERT_TRUE(m_step % mr == 0); + + auto n_step = ukernel_variant.interface.get_n_step(); + ASSERT_TRUE(n_step % nr == 0); + + const auto rect = portion.compute_portion(M, N, m_step, n_step); + if (rect.height() == 0 || rect.width() == 0) { + GTEST_SKIP(); + } + + const auto lhs_start_row = rect.start_row(); + size_t lhs_stride = K * sizeof(float); + // Runs the LHS packing micro-kernel. 
const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr); std::vector imp_packed_lhs(imp_packed_lhs_size); + auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride); + auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr); + ASSERT_EQ(lhs_packed_offset, ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K)); + kai_run_lhs_quant_pack_qai8dxp_f32( - M, K, mr, kr, sr, 0, reinterpret_cast(ref_lhs.data()), K * sizeof(float), imp_packed_lhs.data()); + rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/, + reinterpret_cast(ref_lhs.data() + lhs_offset), lhs_stride, + imp_packed_lhs.data() + lhs_packed_offset); // Runs the RHS packing micro-kernel. // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel. @@ -475,12 +500,21 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsi4cx) { const auto ref_rhs_qsi4_padded = pad_row( ref_rhs_qsi4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); + const auto packed_rhs_start_row = rect.start_col(); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); + ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); + std::vector imp_packed_rhs(imp_packed_rhs_size); const kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0_params params{.lhs_zero_point = 1, .rhs_zero_point = 0}; ukernel_variant.run_rhs_pack( 1, N, K, nr, kr, sr, ref_rhs_qsi4_padded.data(), reinterpret_cast(ref_biases.data()), reinterpret_cast(ref_rhs_scales.data()), imp_packed_rhs.data(), 0, &params); + const auto dst_stride = N * sizeof(float); + const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride); + const auto ref_dst_offset =
rect.start_row() * dst_stride + rect.start_col() * sizeof(float); + ASSERT_EQ(dst_offset, ref_dst_offset); + // Runs the GEMM micro-kernel. const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N); ASSERT_EQ(imp_dst_size, ref_dst.size()); @@ -595,8 +629,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); -- GitLab From 236e93b5a0bedcb4cef72222a25af6798d066bbf Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 16:26:14 +0000 Subject: [PATCH 5/9] Update RHS offset testing with matmul RHS packed offset comparisons Signed-off-by: Anitha Raj --- .../matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp index 5b723aa2..403b85eb 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp @@ -74,20 +74,34 @@ class UkernelVariantTest_withBL : public ::testing::TestWithParam Date: Tue, 24 Dec 2024 16:12:13 +0000 Subject: [PATCH 6/9] Add 1x4 GEMV dotprod microkernels - Add unit tests for new microkernels Signed-off-by: Michael Kozlov Reviewed-by: Anitha Raj Reviewed-by: Anton Bondarenko Approved-by: Jakub Sujak --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp 
| 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index b593c002..e0632949 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -104,6 +104,15 @@ static const std::array Date: Tue, 24 Dec 2024 08:43:45 +0000 Subject: [PATCH 7/9] Add portion tests to per-channel int4 Signed-off-by: Anitha Raj --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index e0632949..0eb311fa 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -270,7 +270,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); @@ -387,7 +388,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, 
ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); @@ -638,7 +640,8 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_kxn_qsu4cx) { ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2)); const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = + kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); -- GitLab From d94f3614455f377315372f1352089f4605581f2b Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 15:20:04 +0000 Subject: [PATCH 8/9] Update per-channel unit tests after rebase Signed-off-by: Anitha Raj --- test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 0eb311fa..567c441a 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -270,8 +270,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsi4cx) { const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, 
ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); @@ -388,8 +387,7 @@ TEST_P(MatMulTest_f32_qai8dxp_qsi4cxp, EndToEnd_RHS_nxk_qsu4cx) { const auto imp_packed_rhs_size = ukernel_variant.get_rhs_packed_size(N, K, nr, kr, sr); const auto packed_rhs_start_row = rect.start_col(); - auto rhs_packed_offset = - kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qs4cxs1s0(packed_rhs_start_row, K, nr, kr, sr); + auto rhs_packed_offset = ukernel_variant.get_rhs_packed_offset(packed_rhs_start_row, K, nr, kr, sr); ASSERT_EQ(rhs_packed_offset, ukernel_variant.interface.get_rhs_packed_offset(packed_rhs_start_row, K)); std::vector imp_packed_rhs(imp_packed_rhs_size); -- GitLab From 3dc35603a9a65167e6f9f86dd8b597000ca45a04 Mon Sep 17 00:00:00 2001 From: Anitha Raj Date: Tue, 24 Dec 2024 16:33:24 +0000 Subject: [PATCH 9/9] Updates to new tests after rebase Signed-off-by: Anitha Raj --- .../tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp index 567c441a..402af683 100644 --- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp +++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4cxp_test.cpp @@ -98,20 +98,11 @@ static const std::array