diff --git a/test/reference/matmul.cpp b/test/reference/matmul.cpp
index 81613212d6e3bb88e660e4524b6b3f74515dd3c0..9eba8b9547be9e272014fcfe39de7e15ff89e347 100644
--- a/test/reference/matmul.cpp
+++ b/test/reference/matmul.cpp
@@ -378,6 +378,87 @@
+template Buffer matmul_nt_t_quantized<int8_t, float, int32_t, Int4, float, int32_t, float, float, float, float>(
+    size_t m, size_t n, size_t k,  //
+    const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    size_t lhs_quant_width,  //
+    const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
+    size_t rhs_quant_width,  //
+    const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width);
+
+template <
+    typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale,
+    typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData>
+Buffer matmul_nt_nt_quantized(
+    size_t m, size_t n, size_t k,  //
+    const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points,  //
+    size_t lhs_quant_height, size_t lhs_quant_width,  //
+    const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points,  //
+    size_t rhs_quant_height, size_t rhs_quant_width,  //
+    const void* bias_data, const void* bias_scales, const void* bias_zero_points,  //
+    size_t bias_quant_width) {
+    const auto lhs_num_quant_per_row = round_up_division(k, lhs_quant_width);
+    const auto rhs_num_quant_per_row = round_up_division(k, rhs_quant_width);
+
+    Buffer dst(m * n * sizeof(DstData));
+
+    for (size_t row = 0; row < m; ++row) {
+        for (size_t col = 0; col < n; ++col) {
+            DstData acc = 0;
+
+            for (size_t i = 0; i < k; ++i) {
+                const auto lhs_data_index = row * k + i;
+                const auto lhs_quant_index = (row / lhs_quant_height) * lhs_num_quant_per_row + i / lhs_quant_width;
+                const auto lhs_value = read_array<LhsData>(lhs_data, lhs_data_index);
+                const auto lhs_scale = lhs_scales != nullptr ? read_array<LhsScale>(lhs_scales, lhs_quant_index)
+                                                             : static_cast<LhsScale>(1);
+                const auto lhs_zero_point = lhs_zero_points != nullptr
+                    ? read_array<LhsZeroPoint>(lhs_zero_points, lhs_quant_index)
+                    : static_cast<LhsZeroPoint>(0);
+
+                const auto rhs_data_index = col + i * n;
+                const auto rhs_quant_index = (col / rhs_quant_height) * rhs_num_quant_per_row + i / rhs_quant_width;
+                const auto rhs_value = read_array<RhsData>(rhs_data, rhs_data_index);
+                const auto rhs_scale = rhs_scales != nullptr ? read_array<RhsScale>(rhs_scales, rhs_quant_index)
+                                                             : static_cast<RhsScale>(1);
+                const auto rhs_zero_point = rhs_zero_points != nullptr
+                    ? read_array<RhsZeroPoint>(rhs_zero_points, rhs_quant_index)
+                    : static_cast<RhsZeroPoint>(0);
+
+                acc += (static_cast<DstData>(lhs_value) - static_cast<DstData>(lhs_zero_point)) *
+                    static_cast<DstData>(lhs_scale) *
+                    (static_cast<DstData>(rhs_value) - static_cast<DstData>(rhs_zero_point)) *
+                    static_cast<DstData>(rhs_scale);
+            }
+
+            if (bias_data != nullptr) {
+                const auto bias_value = read_array<BiasData>(bias_data, col);
+                const auto bias_scale = bias_scales != nullptr
+                    ? read_array<BiasScale>(bias_scales, col / bias_quant_width)
+                    : static_cast<BiasScale>(1);
+                const auto bias_zero_point = bias_zero_points != nullptr
+                    ? read_array<BiasZeroPoint>(bias_zero_points, col / bias_quant_width)
+                    : static_cast<BiasZeroPoint>(0);
+
+                acc += (static_cast<DstData>(bias_value) - static_cast<DstData>(bias_zero_point)) *
+                    static_cast<DstData>(bias_scale);
+            }
+
+            write_array<DstData>(dst.data(), row * n + col, acc);
+        }
+    }
+
+    return dst;
+}
+
+template Buffer matmul_nt_nt_quantized<int8_t, float, int32_t, Int4, float, int32_t, float, float, float, float>(
+    size_t m, size_t n, size_t k,  //
+    const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    size_t lhs_quant_width,  //
+    const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
+    size_t rhs_quant_width,  //
+    const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width);
+
 template Buffer indirect_matmul_nt_t_quantized<int8_t, float, int32_t, int8_t, float, int32_t, float, float, float, float>(
     size_t m, size_t n, size_t k_chunk_count, size_t k_chunk_length,  //
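Note on the new reference kernel: the only functional difference from `matmul_nt_t_quantized` is the RHS walk. A non-transposed RHS is stored KxN, so element (i, col) lives at `col + i * n`, whereas the transposed reference reads `col * k + i`. A minimal standalone sketch of the two indexing schemes (plain `float` stand-ins for the quantized buffers; names are illustrative, not from this patch):

```cpp
#include <cstddef>

// rhs_t is N x K (transposed); rhs_nt is K x N (non-transposed).
float dot_rhs_t(const float* lhs_row, const float* rhs_t, size_t k, size_t col) {
    float acc = 0.0F;
    for (size_t i = 0; i < k; ++i) {
        acc += lhs_row[i] * rhs_t[col * k + i];  // walk row `col` of the N x K matrix
    }
    return acc;
}

float dot_rhs_nt(const float* lhs_row, const float* rhs_nt, size_t k, size_t n, size_t col) {
    float acc = 0.0F;
    for (size_t i = 0; i < k; ++i) {
        acc += lhs_row[i] * rhs_nt[col + i * n];  // walk column `col` of the K x N matrix
    }
    return acc;
}
```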
diff --git a/test/reference/matmul.hpp b/test/reference/matmul.hpp
index 8ef0649076adb1b5c5ecc9a0fbd4a0fcdc64300c..673b1310891131b998b21e0c66a1333a6803e87a 100644
--- a/test/reference/matmul.hpp
+++ b/test/reference/matmul.hpp
@@ -192,6 +192,17 @@ Buffer matmul_nt_t_quantized(
     size_t rhs_quant_width,  //
     const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width);
 
+template <
+    typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale,
+    typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData>
+Buffer matmul_nt_nt_quantized(
+    size_t m, size_t n, size_t k,  //
+    const void* lhs_data, const void* lhs_scales, const void* lhs_zero_points, size_t lhs_quant_height,
+    size_t lhs_quant_width,  //
+    const void* rhs_data, const void* rhs_scales, const void* rhs_zero_points, size_t rhs_quant_height,
+    size_t rhs_quant_width,  //
+    const void* bias_data, const void* bias_scales, const void* bias_zero_points, size_t bias_quant_width);
+
 template <
     typename LhsData, typename LhsScale, typename LhsZeroPoint, typename RhsData, typename RhsScale,
     typename RhsZeroPoint, typename BiasData, typename BiasScale, typename BiasZeroPoint, typename DstData>
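The `quant_height`/`quant_width` pairs describe the quantization block grid over each operand. A hypothetical call matching the instantiation above (buffer names are illustrative): the LHS uses one scale per row, i.e. 1 x K blocks, and the RHS one scale per 1 x `bl` block with no zero points:

```cpp
// Hypothetical usage sketch; lhs_q, rhs_q, *_scales, *_zero_points, biases are
// assumed to be Buffers prepared by the reference quantizers.
const auto ref_dst =
    matmul_nt_nt_quantized<int8_t, float, int32_t, Int4, float, int32_t, float, float, float, float>(
        M, N, K,
        lhs_q.data(), lhs_scales.data(), lhs_zero_points.data(), 1, K,  // LHS: 1 x K blocks (per-row)
        rhs_q.data(), rhs_scales.data(), nullptr, 1, bl,                // RHS: 1 x bl blocks, no zero points
        biases.data(), nullptr, nullptr, 1);                            // bias: unquantized, width-1 blocks
```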
diff --git a/test/reference/quantize.cpp b/test/reference/quantize.cpp
index 008f667663be2f65f060279e7714679ec96da97a..301fef9b897de143cb7cf5ad9dc6903161d243e5 100644
--- a/test/reference/quantize.cpp
+++ b/test/reference/quantize.cpp
@@ -20,6 +20,7 @@
 #include "test/common/round.hpp"
 #include "test/common/type_traits.hpp"
 #include "test/reference/cast.hpp"
+#include "test/reference/transpose.hpp"
 
 namespace kai::test {
 
@@ -293,4 +294,28 @@ template std::tuple<Buffer, Buffer, Buffer> quantize_asymmetric_per_block_dynamic<float, int8_t, float, int32_t>(
     const void* src, size_t height, size_t width, size_t quant_width);
 template std::tuple<Buffer, Buffer, Buffer> quantize_asymmetric_per_block_dynamic<float, uint8_t, float, int32_t>(
     const void* src, size_t height, size_t width, size_t quant_width);
+
+// Reference quantization and packing of the RHS matrix, Int4 per-block.
+// * Generates signed values for the reference matmul.
+// * Generates reference scales from the input RHS matrix.
+template <typename SrcType, typename ScaleType>
+inline std::tuple<Buffer, Buffer> quantize_rhs_qsi4c32p(
+    size_t N, size_t K, size_t bl, const Buffer& rhs, bool transposed) {
+    auto [rhs_values_qsi4, rhs_scales] =
+        quantize_symmetric_per_block_dynamic<SrcType, Int4, ScaleType>(rhs.data(), N, K, bl);
+
+    const size_t width = transposed ? K : N;
+    const size_t height = transposed ? N : K;
+
+    const size_t qsi4_stride = round_up_multiple(width, 2);
+    const size_t qsi4_size_bytes = round_up_division(height * qsi4_stride, 2);
+
+    if (!transposed) {
+        rhs_values_qsi4 = transpose_with_padding<Int4>(rhs_values_qsi4.data(), N, K, K, qsi4_stride, qsi4_size_bytes);
+    }
+
+    return {std::move(rhs_values_qsi4), std::move(rhs_scales)};
+}
+template std::tuple<Buffer, Buffer> quantize_rhs_qsi4c32p<float, float>(
+    size_t N, size_t K, size_t bl, const Buffer& ref_rhs, bool transposed);
 }  // namespace kai::test
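A quick worked example of the stride math above, assuming the usual semantics of `round_up_multiple` (round up to a multiple) and `round_up_division` (ceiling division). Values below are computed by hand for illustration:

```cpp
#include <cstddef>

// N = 5, K = 32, transposed = false (KxN orientation): width = N = 5, height = K = 32.
constexpr size_t width = 5;
constexpr size_t height = 32;
constexpr size_t qsi4_stride = 6;       // round_up_multiple(5, 2): rows padded to an even int4 count
constexpr size_t qsi4_size_bytes = 96;  // round_up_division(32 * 6, 2): two int4 values per byte
static_assert(qsi4_stride * height / 2 == qsi4_size_bytes);
```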
diff --git a/test/reference/quantize.hpp b/test/reference/quantize.hpp
index d0aa3bcd838102199ad43700dbf0f93ed2d85ada..cab79ffeeca8d0cea1406fd2087de486825ad556 100644
--- a/test/reference/quantize.hpp
+++ b/test/reference/quantize.hpp
@@ -358,4 +358,24 @@ template <typename SrcType, typename DstType, typename ScaleType, typename ZeroPointType>
 std::tuple<Buffer, Buffer, Buffer> quantize_asymmetric_per_block_dynamic(
     const void* src, size_t height, size_t width, size_t quant_width);
 
+/// Reference quantization for RHS packing, Int4 per-block.
+///
+/// Quantized values and scales are calculated using the
+/// @ref quantize_symmetric_per_block_dynamic function.
+/// If the quantized matrix needs to be in KxN orientation, the value matrix is then transposed using the
+/// @ref transpose_with_padding function.
+///
+/// @tparam SrcType The data type of the input data (must be floating-point).
+/// @tparam ScaleType The data type of the quantization scales (must be floating-point).
+///
+/// @param[in] N The number of input rows.
+/// @param[in] K The number of input columns.
+/// @param[in] bl The block length for quantization.
+/// @param[in] rhs The matrix to be quantized.
+/// @param[in] transposed Whether the quantized matrix should be transposed (NxK orientation) or non-transposed
+///            (KxN orientation) for an output matrix of MxN.
+///
+/// @return The quantized data matrix and the quantization scale matrix.
+template <typename SrcType, typename ScaleType>
+std::tuple<Buffer, Buffer> quantize_rhs_qsi4c32p(size_t N, size_t K, size_t bl, const Buffer& rhs, bool transposed);
 }  // namespace kai::test
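A hypothetical usage sketch of the new helper, mirroring how the refactored test below calls it (variable names are illustrative): quantize an N x K float RHS to signed int4 with one scale per 1 x `bl` block, keeping NxK (transposed) orientation for the nxk packing kernels:

```cpp
// Assumes fill_random<float> from test/reference/fill.hpp and a seeded RNG.
const auto ref_rhs = fill_random<float>(N * K, seed);
const auto [ref_rhs_qsi4, ref_rhs_scales] =
    quantize_rhs_qsi4c32p<float, float>(N, K, bl, ref_rhs, /* transposed= */ true);
```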
diff --git a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
index be8e2b717e057081b7bfcd1d92f0defdb2db66a8..f753b0b83fc59a0f6ebb30675368d3205f8d2224 100644
--- a/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
+++ b/test/tests/matmul_clamp_f32_qai8dxp_qsi4c32p_test.cpp
@@ -11,7 +11,6 @@
 #include
 #include
 #include
-#include <limits>
 #include
 #include
 #include
@@ -47,6 +46,7 @@
 #include "test/common/round.hpp"
 #include "test/common/test_suite.hpp"
 #include "test/reference/cast.hpp"
+#include "test/reference/clamp.hpp"
 #include "test/reference/fill.hpp"
 #include "test/reference/matmul.hpp"
 #include "test/reference/pad.hpp"
@@ -54,6 +54,9 @@
 #include "test/reference/transpose.hpp"
 
 namespace kai::test {
+
+enum class RhsPackType { NxK, KxN };
+
 static const std::array<UkernelVariant<kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel>, 11>
     variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p = {
         {{UKERNEL_MATMUL_VARIANT(clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod),
@@ -79,273 +82,134 @@ static const std::array<UkernelVariant<kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel>, 11>
-using MatMulTestParams_withBL = std::tuple<size_t, MatMulShape, size_t, MatrixPortion>;
-
-class MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p : public ::testing::TestWithParam<MatMulTestParams_withBL> {};
-
-TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_RHS) {
-    const auto& [variant_index, matmul_shape, bl, portion] = GetParam();
-    const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
-
-    if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
-        GTEST_SKIP() << "Unsupported CPU feature";
-    }
-
-    const size_t M = matmul_shape.m;
-    const size_t N = matmul_shape.n;
-    const size_t K = matmul_shape.k;
-
-    auto m_step = ukernel_variant.interface.get_m_step();
-    auto n_step = ukernel_variant.interface.get_n_step();
-
-    const auto rect = portion.compute_portion(M, N, m_step, n_step);
-    if (rect.height() == 0 || rect.width() == 0) {
-        GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
-    }
-
-    const auto nr = ukernel_variant.interface.get_nr();
-    const auto kr = ukernel_variant.interface.get_kr();
-    const auto sr = ukernel_variant.interface.get_sr();
+// Executes the scalar RHS packing micro-kernel.
+static inline std::tuple<Buffer, size_t> pack_rhs_qsi4c32pscalebf16(
+    size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4,
+    const Buffer& biases, size_t bias_offset, const Buffer& rhs_scales, RhsPackType pack_type, size_t rect_start_row,
+    size_t rect_width) {
+    const size_t width = pack_type == RhsPackType::KxN ? N : K;
+    const size_t height = pack_type == RhsPackType::KxN ? K : N;
 
     kai_datatype scale_dt = kai_datatype::kai_dt_bf16;
-    const auto rhs_start_row = rect.start_col();
-    auto rhs_packed_offset_kxn =
-        kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-    auto rhs_packed_offset_nxk =
-        kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-
-    ASSERT_EQ(rhs_packed_offset_kxn, rhs_packed_offset_nxk);
-
-    auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl);
-    ASSERT_EQ(rhs_packed_offset_kxn, rhs_matmul_offset);
-}
-
-TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, Offset_LHS) {
-    const auto& [variant_index, matmul_shape, bl, portion] = GetParam();
-    const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
-
-    if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
-        GTEST_SKIP() << "Unsupported CPU feature";
+    const size_t rhs_stride = round_up_multiple(width, 2);
+    const size_t rhs_stride_bytes = round_up_division(width, 2);
+    const size_t scales_stride_bytes = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt);
+
+    KAI_ASSUME(rhs_values_qsi4.size() == round_up_division(height * rhs_stride, 2));
+
+    const auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_values_qsi4.size() * 2);
+    auto rhs_qsu4 =
+        pad_row<UInt4>(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * 2, rhs_values_qsi4.size());
+
+    const size_t scale_offset = rect_start_row * scales_stride_bytes;
+    size_t rhs_offset, rhs_packed_offset, imp_packed_rhs_size;
+    if (pack_type == RhsPackType::KxN) {
+        rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rect_start_row, rhs_stride_bytes);
+        rhs_packed_offset =
+            kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rect_start_row, K, nr, kr, sr, bl, scale_dt);
+        imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt);
+    } else {
+        rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rect_start_row, rhs_stride_bytes);
+        rhs_packed_offset =
+            kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rect_start_row, K, nr, kr, sr, bl, scale_dt);
+        imp_packed_rhs_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt);
     }
 
-    const size_t M = matmul_shape.m;
-    const size_t N = matmul_shape.n;
-    const size_t K = matmul_shape.k;
-
-    auto m_step = ukernel_variant.interface.get_m_step();
-    auto n_step = ukernel_variant.interface.get_n_step();
-
-    const auto rect = portion.compute_portion(M, N, m_step, n_step);
-    if (rect.height() == 0 || rect.width() == 0) {
-        GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
+    Buffer imp_packed_rhs(imp_packed_rhs_size);
+    if (pack_type == RhsPackType::KxN) {
+        kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{};
+        params.lhs_zero_point = 1;
+        params.rhs_zero_point = 8;
+        params.scale_dt = scale_dt;
+
+        kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(
+            1, rect_width, K, nr, kr, sr, bl, reinterpret_cast<const uint8_t*>(rhs_qsu4.data() + rhs_offset),
+            rhs_stride_bytes, reinterpret_cast<const float*>(biases.data() + bias_offset),
+            rhs_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0,
+            &params);
+    } else {
+        kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{};
+        params.lhs_zero_point = 1;
+        params.rhs_zero_point = 8;
+        params.scale_dt = scale_dt;
+
+        kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
+            1, rect_width, K, nr, kr, sr, bl, reinterpret_cast<const uint8_t*>(rhs_qsu4.data() + rhs_offset),
+            rhs_stride_bytes, reinterpret_cast<const float*>(biases.data() + bias_offset),
+            rhs_scales.data() + scale_offset, scales_stride_bytes, imp_packed_rhs.data() + rhs_packed_offset, 0,
+            &params);
     }
-
-    const auto mr = ukernel_variant.interface.get_mr();
-    const auto kr = ukernel_variant.interface.get_kr();
-    const auto sr = ukernel_variant.interface.get_sr();
-
-    const auto lhs_start_row = rect.start_row();
-    auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr);
-    auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K);
-
-    ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset);
+    return {std::move(imp_packed_rhs), rhs_packed_offset};
 }
 
-TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_nxk) {
-    const auto& [variant_index, matmul_shape, bl, portion] = GetParam();
-    const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
-
-    if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
-        GTEST_SKIP() << "Unsupported CPU feature";
-    }
-
-    constexpr uint32_t seed = 0;
-
-    const size_t M = matmul_shape.m;
-    const size_t N = matmul_shape.n;
-    const size_t K = matmul_shape.k;
-
-    const auto mr = ukernel_variant.interface.get_mr();
-    const auto nr = ukernel_variant.interface.get_nr();
-    const auto kr = ukernel_variant.interface.get_kr();
-    const auto sr = ukernel_variant.interface.get_sr();
-
-    // Generates input data.
-    const auto ref_lhs = fill_random<float>(M * K, seed + 0);
-    const auto ref_rhs = fill_random<float>(N * K, seed + 1);
-    const auto ref_biases = fill_random<float>(N, seed + 2);
+// Executes the vectorized RHS packing micro-kernels for kr / sr ratios of 8 or 4.
+static inline std::tuple<Buffer, size_t> pack_rhs_qsi4c32pscalebf16_neon(
+    size_t N, size_t K, size_t bl, size_t nr, size_t kr, size_t sr, const Buffer& rhs_values_qsi4,
+    const Buffer& biases, size_t bias_offset, const Buffer& rhs_scales, RhsPackType pack_type, size_t rect_start_row,
+    size_t rect_width) {
+    KAI_ASSUME(kr / sr == 8 || kr / sr == 4);
+    const size_t width = pack_type == RhsPackType::KxN ? N : K;
+    const size_t height = pack_type == RhsPackType::KxN ? K : N;
 
     kai_datatype scale_dt = kai_datatype::kai_dt_bf16;
 
-    // Runs the reference implementation.
-    // * Quantizes the LHS matrix using 8-bit asymmetric quantization.
-    // * Quantizes the RHS matrix using 4-bit symmetric quantization.
-    // * Performs GEMM.
-    const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
-        quantize_asymmetric_per_block_dynamic<float, int8_t, float, int32_t>(ref_lhs.data(), M, K, K);
-    const auto [ref_rhs_qsi4, ref_rhs_scales] =
-        quantize_symmetric_per_block_dynamic<float, Int4, float>(ref_rhs.data(), N, K, bl);
+    const size_t rhs_stride = round_up_multiple(width, 2);
+    const size_t rhs_stride_bytes = round_up_division(width, 2);
+    const size_t scales_stride_bytes = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt);
 
-    const auto ref_dst = matmul_clamp_nt_t<int8_t, float, int32_t, Int4, float, int32_t, float, float, float>(
-        M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), K, ref_rhs_qsi4.data(),
-        ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
-        std::numeric_limits<float>::max());
+    KAI_ASSUME(rhs_values_qsi4.size() == round_up_division(height * rhs_stride, 2));
 
-    auto m_step = ukernel_variant.interface.get_m_step();
-    ASSERT_TRUE(m_step % mr == 0);
+    const auto rhs_values_qsu4 = cast_qsu4_qsi4(rhs_values_qsi4.data(), rhs_values_qsi4.size() * 2);
+    auto rhs_qsu4 =
+        pad_row<UInt4>(rhs_values_qsu4.data(), height, width, width, rhs_stride_bytes * 2, rhs_values_qsi4.size());
 
-    auto n_step = ukernel_variant.interface.get_n_step();
-    ASSERT_TRUE(n_step % nr == 0);
+    size_t scale_offset = rect_start_row * scales_stride_bytes;
 
-    const auto rect = portion.compute_portion(M, N, m_step, n_step);
-    if (rect.height() == 0 || rect.width() == 0) {
-        GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
+    size_t imp_packed_rhs_size_neon, rhs_packed_offset_neon, rhs_offset_neon;
+    if (kr / sr == 8) {
+        imp_packed_rhs_size_neon =
+            kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt);
+        rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(
+            rect_start_row, K, nr, kr, sr, bl, scale_dt);
+        rhs_offset_neon =
+            kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(rect_start_row, rhs_stride_bytes);
+    } else {
+        imp_packed_rhs_size_neon =
+            kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt);
+        rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(
+            rect_start_row, K, nr, kr, sr, bl, scale_dt);
+        rhs_offset_neon =
+            kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(rect_start_row, rhs_stride_bytes);
     }
 
-    const auto lhs_start_row = rect.start_row();
-    size_t lhs_stride = K * sizeof(float);
-
-    // Runs the LHS packing micro-kernel.
-    const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr);
-    Buffer imp_packed_lhs(imp_packed_lhs_size);
-
-    auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride);
-    auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr);
-    auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K);
-    ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset);
-
-    kai_run_lhs_quant_pack_qai8dxp_f32(
-        rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/,
-        reinterpret_cast<const float*>(ref_lhs.data() + lhs_offset), lhs_stride,
-        imp_packed_lhs.data() + lhs_packed_offset);
-
-    // Runs the RHS packing micro-kernel.
-    // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel.
-    // * Packs the RHS matrix.
-    const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), N * K);
-    const auto ref_rhs_qsu4_padded = pad_row<UInt4>(
-        ref_rhs_qsu4.data(), N, K, K, round_up_multiple(K, 2), round_up_division(N * round_up_multiple(K, 2), 2));
-
-    const size_t ref_rhs_qsu4_stride = round_up_division(K, 2);
-    const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt);
-
-    const auto imp_packed_rhs_size =
-        kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt);
-    Buffer imp_packed_rhs(imp_packed_rhs_size);
-
-    const auto rhs_start_row = rect.start_col();
-    auto rhs_packed_offset =
-        kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-    auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl);
-    ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset);
-
-    auto rhs_offset = kai_get_rhs_offset_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride);
-    size_t bias_offset = rhs_start_row * sizeof(float);
-    size_t scale_offset = rhs_start_row * ref_rhs_scales_stride;
-
     kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params params{};
     params.lhs_zero_point = 1;
     params.rhs_zero_point = 8;
-    params.scale_dt = kai_datatype::kai_dt_bf16;
-
-    kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0(
-        1, rect.width() /* n */, K, nr, kr, sr, bl,
-        reinterpret_cast<const uint8_t*>(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride,
-        reinterpret_cast<const float*>(ref_biases.data() + bias_offset),
-        reinterpret_cast<const float*>(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride,
-        imp_packed_rhs.data() + rhs_packed_offset, 0, &params);
-
-    const auto dst_stride = N * sizeof(float);
-    const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride);
-    const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float);
-    ASSERT_EQ(dst_offset, ref_dst_offset);
-
-    const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N);
-    ASSERT_EQ(imp_dst_size, ref_dst.size());
+    params.scale_dt = scale_dt;
 
-    // Runs the GEMM micro-kernel.
-    Buffer imp_dst(imp_dst_size);
+    Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon);
     if (kr / sr == 8) {
-        // Test that vectorized packing kernel for nrx8 gives same output as scalar
-        const auto imp_packed_rhs_size_neon =
-            kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt);
-        ASSERT_EQ(imp_packed_rhs_size_neon, imp_packed_rhs_size);
-
-        Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon);
-
-        auto rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(
-            rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-        ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset);
-
-        auto rhs_offset_neon =
-            kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(rhs_start_row, ref_rhs_qsu4_stride);
-
         kai_run_rhs_pack_nxk_qsi4c32pnrx8_qsu4c32s1s0_neon(
-            1, rect.width() /* n */, K, nr, kr, sr, bl,
-            reinterpret_cast<const uint8_t*>(ref_rhs_qsu4_padded.data() + rhs_offset_neon), ref_rhs_qsu4_stride,
-            reinterpret_cast<const float*>(ref_biases.data() + bias_offset),
-            reinterpret_cast<const float*>(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride,
+            1, rect_width /* n */, K, nr, kr, sr, bl,
+            reinterpret_cast<const uint8_t*>(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
+            reinterpret_cast<const float*>(biases.data() + bias_offset),
+            reinterpret_cast<const float*>(rhs_scales.data() + scale_offset), scales_stride_bytes,
             imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
-
-        ukernel_variant.interface.run_matmul(
-            rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset,
-            imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast<float*>(imp_dst.data() + dst_offset),
-            N * sizeof(float), sizeof(float), std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max());
-
-        DefaultMismatchHandler handler(0, 0.1, 0, 0.05);
-        DataFormat dst_format = DataFormat(DataType::FP32);
-        const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler);
-        ASSERT_TRUE(success);
-    } else if (kr / sr == 4) {
-        // Test that vectorized packing kernel for nrx4 gives same output as scalar
-        const auto imp_packed_rhs_size_neon =
-            kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(N, K, nr, kr, sr, bl, scale_dt);
-        ASSERT_EQ(imp_packed_rhs_size_neon, imp_packed_rhs_size);
-
-        Buffer imp_packed_rhs_neon(imp_packed_rhs_size_neon);
-
-        auto rhs_packed_offset_neon = kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(
-            rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-        ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset);
-
-        auto rhs_offset_neon =
-            kai_get_rhs_offset_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(rhs_start_row, ref_rhs_qsu4_stride);
-
+    } else {
         kai_run_rhs_pack_nxk_qsi4c32pnrx4_qsu4c32s1s0_neon(
-            1, rect.width() /* n */, K, nr, kr, sr, bl,
-            reinterpret_cast<const uint8_t*>(ref_rhs_qsu4_padded.data() + rhs_offset_neon), ref_rhs_qsu4_stride,
-            reinterpret_cast<const float*>(ref_biases.data() + bias_offset),
-            reinterpret_cast<const float*>(ref_rhs_scales.data() + scale_offset), ref_rhs_scales_stride,
+            1, rect_width /* n */, K, nr, kr, sr, bl,
+            reinterpret_cast<const uint8_t*>(rhs_qsu4.data() + rhs_offset_neon), rhs_stride_bytes,
+            reinterpret_cast<const float*>(biases.data() + bias_offset),
+            reinterpret_cast<const float*>(rhs_scales.data() + scale_offset), scales_stride_bytes,
             imp_packed_rhs_neon.data() + rhs_packed_offset_neon, 0, &params);
-
-        ukernel_variant.interface.run_matmul(
-            rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset,
-            imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast<float*>(imp_dst.data() + dst_offset),
-            N * sizeof(float), sizeof(float), std::numeric_limits<float>::lowest(),
-            std::numeric_limits<float>::max());
-
-        DefaultMismatchHandler handler(0, 0.1, 0, 0.05);
-        DataFormat dst_format = DataFormat(DataType::FP32);
-        const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler);
-        ASSERT_TRUE(success);
     }
+    return {std::move(imp_packed_rhs_neon), rhs_packed_offset_neon};
+}
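The two NEON packing entry points are selected purely by the kr / sr ratio reported by the micro-kernel interface. A sketch of the dispatch with illustrative values (the numbers are examples, not taken from this patch):

```cpp
// kr = 16, sr = 2  ->  kr / sr == 8  ->  kai_..._qsi4c32pnrx8_..._neon
// kr = 8,  sr = 2  ->  kr / sr == 4  ->  kai_..._qsi4c32pnrx4_..._neon
const size_t ratio = kr / sr;
KAI_ASSUME(ratio == 8 || ratio == 4);  // other ratios only exercise the scalar packer in the test below
```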
 
-    ukernel_variant.interface.run_matmul(
-        rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset,
-        imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast<float*>(imp_dst.data() + dst_offset),
-        N * sizeof(float), sizeof(float), std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max());
+using MatMulTestParams_withBL_withRHSPackType = std::tuple<size_t, MatMulShape, size_t, MatrixPortion, RhsPackType>;
 
-    // Compares the output of the micro-kernels against the output of the reference implementation for the portion
-    // tested.
-    DefaultMismatchHandler handler(0, 0.1, 0, 0.05);
-    DataFormat dst_format = DataFormat(DataType::FP32);
-    const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler);
-    ASSERT_TRUE(success);
-}
+class MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p
+    : public ::testing::TestWithParam<MatMulTestParams_withBL_withRHSPackType> {};
 
-TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
-    const auto& [variant_index, matmul_shape, bl, portion] = GetParam();
-    const auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
+TEST_P(MatMulTest_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd) {
+    auto& [variant_index, matmul_shape, bl, portion, rhs_pack_type] = GetParam();
+    auto& ukernel_variant = variants_kai_matmul_clamp_f32_qai8dxp_qsi4c32p.at(variant_index);
 
     if (ukernel_variant.fn_is_supported && !ukernel_variant.fn_is_supported()) {
         GTEST_SKIP() << "Unsupported CPU feature";
@@ -353,46 +217,14 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
 
     const uint32_t seed = 0;
 
-    const size_t M = matmul_shape.m;
-    const size_t N = matmul_shape.n;
-    const size_t K = matmul_shape.k;
-
-    const auto mr = ukernel_variant.interface.get_mr();
-    const auto nr = ukernel_variant.interface.get_nr();
-    const auto kr = ukernel_variant.interface.get_kr();
-    const auto sr = ukernel_variant.interface.get_sr();
-
-    // Generates input data.
-    const auto ref_lhs = fill_random<float>(M * K, seed + 0);
-    const auto ref_rhs_transposed = fill_random<float>(N * K, seed + 1);
-    const auto ref_biases = fill_random<float>(N, seed + 2);
-
-    kai_datatype scale_dt = kai_datatype::kai_dt_bf16;
-
-    // Transposed(nxk) RHS dimensions
-    const size_t ref_rhs_qsi4_nxk_stride = K;
+    size_t M = matmul_shape.m;
+    size_t N = matmul_shape.n;
+    size_t K = matmul_shape.k;
 
-    // Non-Transposed(kxn) RHS dimensions
-    const size_t ref_rhs_qsi4_kxn_stride = round_up_multiple(N, 2);
-    const size_t ref_rhs_qsi4_kxn_size = K * ref_rhs_qsi4_kxn_stride;
-    const size_t ref_rhs_qsi4_kxn_size_bytes = round_up_division(ref_rhs_qsi4_kxn_size, 2);
-
-    // Runs the reference implementation.
-    // * Quantizes the LHS matrix using 8-bit asymmetric quantization.
-    // * Quantizes the RHS matrix using 4-bit symmetric quantization.
-    // * Performs GEMM.
-    const auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
-        quantize_asymmetric_per_block_dynamic<float, int8_t, float, int32_t>(ref_lhs.data(), M, K, K);
-    const auto [ref_rhs_qsi4_transposed, ref_rhs_scales] =
-        quantize_symmetric_per_block_dynamic<float, Int4, float>(ref_rhs_transposed.data(), N, K, bl);
-
-    auto ref_rhs_qsi4 = transpose_with_padding<Int4>(
-        ref_rhs_qsi4_transposed.data(), N, K, ref_rhs_qsi4_nxk_stride, ref_rhs_qsi4_kxn_stride,
-        ref_rhs_qsi4_kxn_size_bytes);
-
-    const auto ref_dst = matmul_clamp_nt_nt<int8_t, float, int32_t, Int4, float, int32_t, float, float, float>(
-        M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), K, ref_rhs_qsi4.data(),
-        ref_rhs_scales.data(), nullptr, bl, ref_biases.data(), std::numeric_limits<float>::lowest(),
-        std::numeric_limits<float>::max());
+    auto mr = ukernel_variant.interface.get_mr();
+    auto nr = ukernel_variant.interface.get_nr();
+    auto kr = ukernel_variant.interface.get_kr();
+    auto sr = ukernel_variant.interface.get_sr();
 
     auto m_step = ukernel_variant.interface.get_m_step();
     ASSERT_TRUE(m_step % mr == 0);
@@ -400,86 +232,115 @@ TEST_P(MatMulTest_f32_qmatmul_clamp_f32_qai8dxp_qsi4c32p, EndToEnd_RHS_kxn) {
     auto n_step = ukernel_variant.interface.get_n_step();
     ASSERT_TRUE(n_step % nr == 0);
 
-    const auto rect = portion.compute_portion(M, N, m_step, n_step);
+    auto rect = portion.compute_portion(M, N, m_step, n_step);
     if (rect.height() == 0 || rect.width() == 0) {
         GTEST_SKIP() << "Empty dimension of matrix(" << rect.width() << "," << rect.height() << ")";
     }
 
-    const auto lhs_start_row = rect.start_row();
-    size_t lhs_stride = K * sizeof(float);
+    // Generates input data.
+    const auto ref_lhs = fill_random<float>(M * K, seed + 0);
+    const auto ref_rhs = fill_random<float>(N * K, seed + 1);
+    const auto ref_biases = fill_random<float>(N, seed + 2);
+
+    // Runs the reference implementation.
+    // * Quantizes the LHS matrix using 8-bit asymmetric quantization.
+    // * Quantizes the RHS matrix using 4-bit symmetric quantization.
+    // * Performs GEMM.
+    auto [ref_lhs_qvalues, ref_lhs_scales, ref_lhs_zero_points] =
+        quantize_asymmetric_per_block_dynamic<float, int8_t, float, int32_t>(ref_lhs.data(), M, K, K);
+    auto [ref_rhs_values_qsi4, ref_rhs_scales] =
+        quantize_rhs_qsi4c32p<float, float>(N, K, bl, ref_rhs, rhs_pack_type == RhsPackType::NxK);
+
+    Buffer ref_dst_noclamp;
+    if (rhs_pack_type == RhsPackType::NxK) {
+        ref_dst_noclamp =
+            matmul_nt_t_quantized<int8_t, float, int32_t, Int4, float, int32_t, float, float, float, float>(
+                M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
+                ref_rhs_values_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr,
+                1);
+    } else {
+        ref_dst_noclamp =
+            matmul_nt_nt_quantized<int8_t, float, int32_t, Int4, float, int32_t, float, float, float, float>(
+                M, N, K, ref_lhs_qvalues.data(), ref_lhs_scales.data(), ref_lhs_zero_points.data(), 1, K,
+                ref_rhs_values_qsi4.data(), ref_rhs_scales.data(), nullptr, 1, bl, ref_biases.data(), nullptr, nullptr,
+                1);
+    }
+
+    // Clamps the reference output.
+    const auto clamp_ratio = 0.8F;
+    const auto [clamp_min, clamp_max] = find_clamp_range<float>(ref_dst_noclamp.data(), M * N, clamp_ratio);
+    auto ref_dst = clamp<float>(ref_dst_noclamp.data(), M * N, clamp_min, clamp_max);
 
     // Runs the LHS packing micro-kernel.
+    const auto lhs_start_row = rect.start_row();
     const auto imp_packed_lhs_size = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr);
     Buffer imp_packed_lhs(imp_packed_lhs_size);
 
+    const auto lhs_stride = K * sizeof(float);
+
     auto lhs_offset = kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, lhs_stride);
     auto lhs_packed_offset = kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(lhs_start_row, K, mr, kr, sr);
     auto lhs_matmul_offset = ukernel_variant.interface.get_lhs_packed_offset(lhs_start_row, K);
     ASSERT_EQ(lhs_packed_offset, lhs_matmul_offset);
 
     kai_run_lhs_quant_pack_qai8dxp_f32(
-        rect.height() /* m */, K, mr, kr, sr, 0 /* m_idx_start*/,
-        reinterpret_cast<const float*>(ref_lhs.data() + lhs_offset), lhs_stride,
-        imp_packed_lhs.data() + lhs_packed_offset);
-
-    // Runs the RHS packing micro-kernel.
-    // * Generates the 4-bit unsigned symmetric quantized input for the micro-kernel.
-    // * Packs the RHS matrix.
-    const auto ref_rhs_qsu4 = cast_qsu4_qsi4(ref_rhs_qsi4.data(), ref_rhs_qsi4_kxn_size);
-    const auto ref_rhs_qsu4_padded = pad_row<UInt4>(
-        ref_rhs_qsu4.data(), K, N, N, round_up_multiple(N, 2), round_up_division(K * round_up_multiple(N, 2), 2));
-    const size_t ref_rhs_qsu4_stride = round_up_division(N, 2);
-    const size_t ref_rhs_scales_stride = round_up_division(K, bl) * kai_get_datatype_size_in_bytes(scale_dt);
+        rect.height() /* m */, K, mr, kr, sr, 0, reinterpret_cast<const float*>(ref_lhs.data() + lhs_offset),
+        lhs_stride, reinterpret_cast<uint8_t*>(imp_packed_lhs.data()) + lhs_packed_offset);
 
     const auto rhs_start_row = rect.start_col();
-    auto rhs_offset = kai_get_rhs_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, ref_rhs_qsu4_stride);
-    auto rhs_packed_offset =
-        kai_get_rhs_packed_offset_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(rhs_start_row, K, nr, kr, sr, bl, scale_dt);
-    auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl);
-    ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset);
-
     size_t bias_offset = rhs_start_row * sizeof(float);
-    size_t scale_offset = rhs_start_row * ref_rhs_scales_stride;
 
-    const auto imp_packed_rhs_size =
-        kai_get_rhs_packed_size_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(N, K, nr, kr, sr, bl, scale_dt);
-    Buffer imp_packed_rhs(imp_packed_rhs_size);
+    auto [imp_packed_rhs, rhs_packed_offset] = pack_rhs_qsi4c32pscalebf16(
+        N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, ref_biases, bias_offset, ref_rhs_scales, rhs_pack_type,
+        rhs_start_row, rect.width());
 
-    kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0_params params{};
-    params.lhs_zero_point = 1;
-    params.rhs_zero_point = 8;
-    params.scale_dt = kai_datatype::kai_dt_bf16;
-
-    kai_run_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0(
-        1, rect.width() /* n */, K, nr, kr, sr, bl,
-        reinterpret_cast<const uint8_t*>(ref_rhs_qsu4_padded.data() + rhs_offset), ref_rhs_qsu4_stride,
-        reinterpret_cast<const float*>(ref_biases.data() + bias_offset), ref_rhs_scales.data() + scale_offset,
-        ref_rhs_scales_stride, imp_packed_rhs.data() + rhs_packed_offset, 0, &params);
+    auto rhs_matmul_offset = ukernel_variant.interface.get_rhs_packed_offset(rhs_start_row, K, bl);
+    ASSERT_EQ(rhs_packed_offset, rhs_matmul_offset);
 
-    const auto dst_stride = N * sizeof(float);
-    const auto dst_offset = ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride);
-    const auto ref_dst_offset = rect.start_row() * dst_stride + rect.start_col() * sizeof(float);
+    const auto dst_stride_row = N * sizeof(float);
+    const auto dst_stride_col = sizeof(float);
+    const auto dst_offset =
+        ukernel_variant.interface.get_dst_offset(rect.start_row(), rect.start_col(), dst_stride_row);
+    const auto ref_dst_offset = rect.start_row() * dst_stride_row + rect.start_col() * dst_stride_col;
     ASSERT_EQ(dst_offset, ref_dst_offset);
 
     // Runs the GEMM micro-kernel.
     const auto imp_dst_size = ukernel_variant.interface.get_dst_size(M, N);
     ASSERT_EQ(imp_dst_size, ref_dst.size());
     Buffer imp_dst(imp_dst_size);
+
     ukernel_variant.interface.run_matmul(
-        rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset,
-        imp_packed_rhs.data() + rhs_matmul_offset, reinterpret_cast<float*>(imp_dst.data() + dst_offset),
-        N * sizeof(float), sizeof(float), std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max());
+        rect.height(), rect.width(), K, bl,
+        reinterpret_cast<const uint8_t*>(imp_packed_lhs.data()) + lhs_matmul_offset,
+        reinterpret_cast<const uint8_t*>(imp_packed_rhs.data()) + rhs_matmul_offset,
+        reinterpret_cast<float*>(imp_dst.data() + dst_offset), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
 
     // Compares the output of the micro-kernels against the output of the reference implementation for the portion
     // tested.
     DefaultMismatchHandler handler(0, 0.1, 0, 0.05);
     DataFormat dst_format = DataFormat(DataType::FP32);
-    const auto success = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler);
+    const auto success =
+        compare(reinterpret_cast<const void*>(imp_dst.data()), ref_dst.data(), dst_format, M, N, rect, handler);
     ASSERT_TRUE(success);
+
+    // Tests the vectorized packing functions, if the packing parameters allow it.
+    if (rhs_pack_type == RhsPackType::NxK && (kr / sr == 8 || kr / sr == 4)) {
+        const auto [imp_packed_rhs_neon, rhs_packed_offset_neon] = pack_rhs_qsi4c32pscalebf16_neon(
+            N, K, bl, nr, kr, sr, ref_rhs_values_qsi4, ref_biases, bias_offset, ref_rhs_scales, rhs_pack_type,
+            rhs_start_row, rect.width());
+        ASSERT_EQ(rhs_packed_offset_neon, rhs_packed_offset);
+
+        ukernel_variant.interface.run_matmul(
+            rect.height(), rect.width(), K, bl, imp_packed_lhs.data() + lhs_matmul_offset,
+            imp_packed_rhs_neon.data() + rhs_matmul_offset, reinterpret_cast<float*>(imp_dst.data() + dst_offset),
+            dst_stride_row, dst_stride_col, clamp_min, clamp_max);
+
+        const auto success_neon = compare(imp_dst.data(), ref_dst.data(), dst_format, M, N, rect, handler);
+        ASSERT_TRUE(success_neon);
+    }
 }
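The test now derives its clamp bounds from the unclamped reference output (rather than using `std::numeric_limits`), so the clamp path of the micro-kernel is actually exercised. A minimal sketch of an equivalent range computation, under the assumption that `find_clamp_range` symmetrically shrinks the observed [min, max] to `ratio` of its extent:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>

// Hypothetical re-implementation for illustration only; the real helper lives
// in test/reference/clamp.hpp and its exact semantics may differ.
std::pair<float, float> find_clamp_range_sketch(const float* data, size_t len, float ratio) {
    const auto [lo_it, hi_it] = std::minmax_element(data, data + len);
    const float margin = (*hi_it - *lo_it) * (1.0F - ratio) / 2.0F;
    return {*lo_it + margin, *hi_it - margin};
}
```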
"__NxK" : "__KxN") << "__"; PrintTo(shape, &sstream); sstream << "__BL_" << bl << "__"; PrintTo(portion, &sstream);