From cb596442694dcad5911ac7d27e58e39b2000f8e2 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 31 May 2024 16:12:10 +0100 Subject: [PATCH] Fix doxygen format Signed-off-by: Gian Marco Iodice --- ..._f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h | 6 + ...ai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h | 181 ++++++++---------- ...ai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h | 181 ++++++++---------- ...2_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h | 181 ++++++++---------- ...2_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h | 181 ++++++++---------- ...2_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h | 181 ++++++++---------- ...2_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h | 181 ++++++++---------- .../pack/kai_lhs_quant_pack_qai8dxp_f32.h | 106 +++++----- .../kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h | 120 ++++++------ 9 files changed, 613 insertions(+), 705 deletions(-) diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h index d98e5442..4d668a09 100644 --- a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h +++ b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h @@ -17,6 +17,12 @@ extern "C" { #endif // __cplusplus +/// Micro-kernel dependencies +/// +/// -# kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon to pack the RHS matrix + +/// -------------------------------------------------- + /// Gets m step value. /// /// The starting row index must be divisible by `m_step`. 
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h index 7334e72f..98dc5880 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. +/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. 
+/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to get the kr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 
8-bit quantized asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). - * @param[in] k The common dimension between the LHS and RHS matrix (K). - * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). +/// @param[in] k The common dimension between the LHS and RHS matrix (K). +/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. - * @param[in] n_idx Column index in the DST matrix. It must be multiple of 4.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. +/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 4. +/// @param[in] dst_stride The number of bytes in each row of the DST matrix +/// +/// @return the destination offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix. - * - * @param[in] m Number of rows in the destination (DST) matrix - * @param[in] n Number of columns in the destination (DST) matrix - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix +/// @param[in] n Number of columns in the destination (DST) matrix +/// +/// @return the DST size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. - * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. - * Output tile: (rows x cols) = 1 x 4 - * Accumulation performed in a single for loop: 64 - * Instruction used: dotprod - * - * @param[in] m The number of output rows written. - * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed.
- * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. +/// Output tile: (rows x cols) = 1 x 4 +/// Accumulation performed in a single for loop: 64 +/// Instruction used: dotprod +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. +/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. +/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. 
For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h index 69a16a3b..b7810898 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. +/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. 
+/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. +/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to get the kr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t
kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). - * @param[in] k The common dimension between the LHS and RHS matrix (K). - * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). +/// @param[in] k The common dimension between the LHS and RHS matrix (K).
+/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. - * @param[in] n_idx Column index in the DST matrix. It must be multiple of 8. - * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. +/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 8. +/// @param[in] dst_stride The number of bytes in each row of the DST matrix +/// +/// @return the DST offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix. - * - * @param[in] m Number of rows in the destination (DST) matrix - * @param[in] n Number of columns in the destination (DST) matrix - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix +/// @param[in] n Number of columns in the destination (DST) matrix +/// +/// @return the destination size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. - * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. - * Output tile: (rows x cols) = 1 x 8 - * Accumulation performed in a single for loop: 64 - * Instruction used: dotprod - * - * @param[in] m The number of output rows written.
- * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed. - * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. +/// Output tile: (rows x cols) = 1 x 8 +/// Accumulation performed in a single for loop: 64 +/// Instruction used: dotprod +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. +/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. 
+/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h index 8592d285..aec1ca0e 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. 
+/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. +/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to get the kr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix 
with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4. - * @param[in] k The common dimension between the LHS and RHS matrix (K). - * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
+/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4. +/// @param[in] k The common dimension between the LHS and RHS matrix (K). +/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. It must be a multiple of 4. - * @param[in] n_idx Column index in the DST matrix. It must be multiple of 4. - * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 4. +/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 4. +/// @param[in] dst_stride The number of bytes in each row of the DST matrix +/// +/// @return the DST offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix. - * - * @param[in] m Number of rows in the destination (DST) matrix. - * @param[in] n Number of columns in the destination (DST) matrix. - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix. +/// @param[in] n Number of columns in the destination (DST) matrix. +/// +/// @return the destination size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. - * Output tile: (rows x cols) = 4 x 4 - * Accumulation performed in a single for loop: 32 - * Instruction used: i8mm - * - * @param[in] m The number of output rows written. - * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed. - * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. +/// Output tile: (rows x cols) = 4 x 4 +/// Accumulation performed in a single for loop: 32 +/// Instruction used: i8mm +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. 
+/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. +/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h index c22b4d71..bc277b12 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. 
- * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. +/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. +/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to get the kr value, which must be used to pack
the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8 - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8 +/// @param[in] k Total number of columns in the LHS matrix (not packed). 
+/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4. - * @param[in] k The common dimension between the LHS and RHS matrix (K). - * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4. +/// @param[in] k The common dimension between the LHS and RHS matrix (K). +/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. It must be a multiple of 8. - * @param[in] n_idx Column index in the DST matrix. It must be multiple of 4. - * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 8. +/// @param[in] n_idx Column index in the DST matrix. It must be a multiple of 4. +/// @param[in] dst_stride The number of bytes in each row of the DST matrix +/// +/// @return the DST offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix.
- * - * @param[in] m Number of rows in the destination (DST) matrix. - * @param[in] n Number of columns in the destination (DST) matrix. - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix. +/// @param[in] n Number of columns in the destination (DST) matrix. +/// +/// @return the destination size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. - * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. - * Output tile: (rows x cols) = 8 x 4 - * Accumulation performed in a single for loop: 32 - * Instruction used: i8mm - * - * @param[in] m The number of output rows written. - * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed. - * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. 
+/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. +/// Output tile: (rows x cols) = 8 x 4 +/// Accumulation performed in a single for loop: 32 +/// Instruction used: i8mm +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. +/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. +/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. 
void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h index 94c5f6c1..b9f06ea2 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. +/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. 
+/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to get the kr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized 
asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 4. - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 4. +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8. - * @param[in] k The common dimension between the LHS and RHS matrix (K).
+/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. It must be a multiple of 4. - * @param[in] n_idx Column index in the DST matrix. It must be a multiple of 8. - * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 4. +/// @param[in] n_idx Column index in the DST matrix. It must be a multiple of 8. +/// @param[in] dst_stride The number of bytes in in each row of the DST matrix +/// +/// @return the DST offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix. - * - * @param[in] m Number of rows in the destination (DST) matrix. - * @param[in] n Number of columns in the destination (DST) matrix. - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix. +/// @param[in] n Number of columns in the destination (DST) matrix. +/// +/// @return the destination size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. - * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. 
- * Output tile: (rows x cols) = 4 x 8 - * Accumulation performed in a single for loop: 32 - * Instruction used: i8mm - * - * @param[in] m The number of output rows written. - * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed. - * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed. +/// Output tile: (rows x cols) = 4 x 8 +/// Accumulation performed in a single for loop: 32 +/// Instruction used: i8mm +/// +/// @param[in] m The number of output rows written. +/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. +/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. 
+/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h index 480f3c5f..7c289441 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h @@ -14,124 +14,111 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @return the m step value - */ +/// Micro-kernel dependencies +/// +/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix +/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix + +/// -------------------------------------------------- + +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. 
+/// +/// @return the m step value size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @return the n step - */ +/// Gets the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. +/// +/// @return the n step size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to get the mr value, which must be used to pack the LHS matrix with - * the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel - * - * @return the mr value - */ +/// Gets the mr value, which must be used to pack the LHS matrix with +/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel +/// +/// @return the mr value size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to get the nr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the nr value - */ +/// Gets the nr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the nr value size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to get the kr value, which must be used to pack the RHS matrix with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the kr value - */ +/// Gets the kr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the kr value size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to get the sr value, which must be used to pack the RHS matrix 
with - * the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel - * - * @return the sr value - */ +/// Gets the sr value, which must be used to pack the RHS matrix with +/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel +/// +/// @return the sr value size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8 - * @param[in] k Total number of columns in the LHS matrix (not packed). - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. +/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8 +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8. - * @param[in] k The common dimension between the LHS and RHS matrix (K). 
- * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8. +/// @param[in] k The common dimension between the LHS and RHS matrix (K). +/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t n_idx, size_t k); -/** - * @brief Function to calculate the offset in bytes for the DST matrix - * - * @param[in] m_idx Row index in the DST matrix. It must be a multiple of 8. - * @param[in] n_idx Column index in the DST matrix. It must be multiple of 8. - * @param[in] dst_stride The number of bytes in in each row of the DST matrix - * - * return the DST offset in bytes - */ +/// Gets the offset in bytes for the DST matrix +/// +/// @param[in] m_idx Row index in the DST matrix. It must be a multiple of 8. +/// @param[in] n_idx Column index in the DST matrix. It must be multiple of 8. +/// @param[in] dst_stride The number of bytes in in each row of the DST matrix +/// +/// @return the DST offset in bytes size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm( size_t m_idx, size_t n_idx, size_t dst_stride); -/** - * @brief Function to query the size in bytes for the destination matrix. - * - * @param[in] m Number of rows in the destination (DST) matrix. - * @param[in] n Number of columns in the destination (DST) matrix. - */ +/// Gets the size in bytes for the destination matrix. +/// +/// @param[in] m Number of rows in the destination (DST) matrix. +/// @param[in] n Number of columns in the destination (DST) matrix. 
+/// +/// @return the destination size in bytes size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m, size_t n); -/** - * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. - * - * LHS matrix: Signed 8-bit quantized asymmetric per-row (qau8dx) and packed - * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed. - * Output tile: (rows x cols) = 8 x 8 - * Accumulation performed in a single for loop: 32 - * Instruction used: i8mm - * - * @param[in] m The number of output rows written. - * @param[in] n The number of output columns written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. - * @param[in] lhs_packed The LHS matrix packed. - * When the activation are dynamically quantized, you can obtain this matrix - * by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs - * both the dynamic quantization to 8-bit and activation packing in a single step. - * @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref - * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 - * @param[out] dst Result of the vector-by-matrix - * @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. - * @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) - * @param[in] scalar_min Min value used to clamp the final result. - * @param[in] scalar_max Max value used to clamp the final result. - */ +/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation. +/// +/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed +/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed. +/// Output tile: (rows x cols) = 8 x 8 +/// Accumulation performed in a single for loop: 32 +/// Instruction used: i8mm +/// +/// @param[in] m The number of output rows written.
+/// @param[in] n The number of output columns written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. +/// @param[in] lhs_packed The LHS matrix packed. +/// When the activation are dynamically quantized, you can obtain this matrix +/// by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs +/// both the dynamic quantization to 8-bit and activation packing in a single step. +/// @param[in] rhs_packed The RHS matrix packed, which is obtained by calling @ref +/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 +/// @param[out] dst Result of the vector-by-matrix +/// @param[in] dst_stride_row Stride in bytes between two rows of the DST matrix. +/// @param[in] dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float) +/// @param[in] scalar_min Min value used to clamp the final result. +/// @param[in] scalar_max Max value used to clamp the final result. void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm( size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max); diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h index 28bd1ac1..acba70cd 100644 --- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h +++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h @@ -12,72 +12,62 @@ extern "C" { #endif -/** - * @brief Function to get the m step value. - * The micro-kernel can process any M values. However, the starting M index to - * be processed must be a multiple of m step. - * - * @param[in] mr The number of M rows to interleave on the same output row. - * - * @return the m step value - */ +/// Gets the m step value. +/// The micro-kernel can process any M values. However, the starting M index to +/// be processed must be a multiple of m step. 
+/// +/// @param[in] mr The number of M rows to interleave on the same output row. +/// +/// @return the m step value size_t kai_get_m_step_lhs_quant_pack_qai8dxp_f32(size_t mr); -/** - * @brief Function to calculate the offset in bytes for the LHS matrix (not packed) - * - * This function should be called before passing the pointer to the LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). - * @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed) - * - * return the offset in bytes to the LHS matrix - */ +/// Gets the offset in bytes for the LHS matrix (not packed) +/// +/// This function should be called before passing the pointer to the LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed) +/// +/// @return the offset in bytes to the LHS matrix size_t kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(size_t m_idx, size_t lhs_stride); -/** - * @brief Function to calculate the offset in bytes for the packed LHS matrix, - * which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. - * - * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. - * - * @param[in] m_idx Row index in the LHS matrix (not packed). - * @param[in] k Total number of columns in the LHS matrix (not packed). - * @param[in] mr The number of M rows to interleave on the same output row. - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * - * return the offset in bytes to the packed LHS matrix - */ +/// Gets the offset in bytes for the packed LHS matrix, +/// which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values. 
+/// +/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel. +/// +/// @param[in] m_idx Row index in the LHS matrix (not packed). +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// @param[in] mr The number of M rows to interleave on the same output row. +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. +/// +/// @return the offset in bytes to the packed LHS matrix size_t kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr); -/** - * @brief Function to return the memory required for storing the quantized and packed LHS matrix - * - * @param[in] m Total number of rows in the LHS matrix (not packed). - * @param[in] k Total number of columns in the LHS matrix (not packed). - * @param[in] mr The number of M rows to interleave on the same output row. - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * - * return the size in bytes to the packed LHS matrix - */ +/// Gets the size in bytes for the quantized and packed LHS matrix +/// +/// @param[in] m Total number of rows in the LHS matrix (not packed). +/// @param[in] k Total number of columns in the LHS matrix (not packed). +/// @param[in] mr The number of M rows to interleave on the same output row. +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. 
+/// +/// @return the packed LHS matrix size in bytes size_t kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(size_t m, size_t k, size_t mr, size_t kr, size_t sr); -/** - * @brief Micro-kernel to quantize and pack the LHS matrix - * - * @param[in] m The number of output rows written. - * @param[in] k The number of channels. The common dimension of LHS & RHS. It must be multiple of 8. - * @param[in] mr The number of M rows to interleave on the same output row. - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * However, kr must be multiple of sr. - * @param[in] m_idx_start The starting M index. - * @param[in] lhs LHS of the vector-by-matrix. - * @param[in] lhs_stride Stride in bytes between two rows of LHS. - * @param[out] lhs_packed The quantized and packed LHS matrix. - */ +/// Runs the micro-kernel to quantize and pack the LHS matrix. +/// +/// @param[in] m The number of output rows written. +/// @param[in] k The number of channels. The common dimension of LHS & RHS. It must be a multiple of 8. +/// @param[in] mr The number of M rows to interleave on the same output row. +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. +/// However, kr must be a multiple of sr. +/// @param[in] m_idx_start The starting M index. +/// @param[in] lhs LHS of the vector-by-matrix. +/// @param[in] lhs_stride Stride in bytes between two rows of LHS. +/// @param[out] lhs_packed The quantized and packed LHS matrix.
void kai_run_lhs_quant_pack_qai8dxp_f32( size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride, void* lhs_packed); diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h index 1ce70c8b..bf947c1c 100644 --- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h +++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h @@ -17,80 +17,70 @@ struct kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0_params { uint8_t rhs_zero_point; }; -/** - * @brief Function to get the n step value. - * The micro-kernel can process any N values. However, the starting N index to - * be processed must be a multiple of n step. - * - * @param[in] nr The number of columns written by the matmul micro-kernel - * - * @return the n step value - */ +/// Get the n step value. +/// The micro-kernel can process any N values. However, the starting N index to +/// be processed must be a multiple of n step. +/// +/// @param[in] nr The number of columns written by the matmul micro-kernel +/// +/// @return the n step value size_t kai_get_n_step_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t nr); -/** - * @brief Function to calculate the offset in bytes for the RHS matrix (not packed), which holds - * the int4 values in a N x K matrix, where N is number of rows and K is the number of columns. - * Two int4 values are stored in one byte. The lower order part of the byte (low) holds - * the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1). - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step. 
- * @param[in] rhs_stride The number of bytes in in each row of the RHS matrix (not packed) - * - * return the offset in bytes to the RHS matrix (not packed) - */ +/// Gets the offset in bytes for the RHS matrix (not packed), which holds +/// the int4 values in a N x K matrix, where N is number of rows and K is the number of columns. +/// Two int4 values are stored in one byte. The lower order part of the byte (low) holds +/// the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1). +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step. +/// @param[in] rhs_stride The number of bytes in each row of the RHS matrix (not packed) +/// +/// @return the offset in bytes to the RHS matrix (not packed) size_t kai_get_rhs_offset_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t n_idx, size_t rhs_stride); -/** - * @brief Function to calculate the offset in bytes for the packed RHS matrix, - * which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. - * - * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step. - * @param[in] k The common dimension between the LHS and RHS matrix (K) - * @param[in] nr The number of columns written by the matmul micro-kernel - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * - * return the offset in bytes to the packed RHS matrix - */ +/// Gets the offset in bytes for the packed RHS matrix, +/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values. +/// +/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step.
+/// @param[in] k The common dimension between the LHS and RHS matrix (K) +/// @param[in] nr The number of columns written by the matmul micro-kernel +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. +/// +/// @return the offset in bytes to the packed RHS matrix size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( size_t n_idx, size_t k, size_t nr, size_t kr, size_t sr); -/** - * @brief Function to return the memory required for storing the packed RHS matrix - * - * @param[in] n The number of rows in the RHS matrix (not packed) - * @param[in] k The number of columns in the RHS matrix (not packed). - * @param[in] nr The number of columns written by the matmul micro-kernel - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * - * return the size in bytes to the packed RHS matrix - */ +/// Gets the size in bytes for the packed RHS matrix +/// +/// @param[in] n The number of rows in the RHS matrix (not packed) +/// @param[in] k The number of columns in the RHS matrix (not packed). +/// @param[in] nr The number of columns written by the matmul micro-kernel +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. +/// +/// @return the packed RHS matrix size in bytes size_t kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t n, size_t k, size_t nr, size_t kr, size_t sr); -/** - * @brief Micro-kernel to pack the RHS matrix. - * - * @note The int4 values are stored in a N x K matrix, where N is number of rows and K is the number of columns. - * Two int4 values are stored in one byte.
The lower order part of the byte (low) holds - * the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1). - * - * @param[in] num_groups The number of groups. It must be 1. - * @param[in] n The number of columns of the output matrix (N). - * @param[in] k The common dimension between the LHS and RHS matrix (K). It must be an even value. - * @param[in] nr The number of N columns to interleave on the same output output row. - * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. - * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr. - * However, kr must be multiple of sr. - * @param[in] rhs The RHS matrix containing the 4-bit values. - * Size in bytes is expected to be greater than or equal to n * k * (sizeof(uint8_t) / 2). - * @param[in] bias The biases. - * @param[in] scale The scale for each output channel. - * @param[out] rhs_packed The packed RHS matrix. - * @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. - * @param[in] params Parameters for the micro-kernel. - */ +/// Run the micro-kernel to pack the RHS matrix. +/// +/// @note The int4 values are stored in a N x K matrix, where N is number of rows and K is the number of columns. +/// Two int4 values are stored in one byte. The lower order part of the byte (low) holds +/// the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1). +/// +/// @param[in] num_groups The number of groups. It must be 1. +/// @param[in] n The number of columns of the output matrix (N). +/// @param[in] k The common dimension between the LHS and RHS matrix (K). It must be an even value. +/// @param[in] nr The number of N columns to interleave on the same output row. +/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel. +/// @param[in] sr The number of kr splits.
It can be 1 (no splits) up to kr. +/// However, kr must be multiple of sr. +/// @param[in] rhs The RHS matrix containing the 4-bit values. +/// Size in bytes is expected to be greater than or equal to n * k * (sizeof(uint8_t) / 2). +/// @param[in] bias The biases. +/// @param[in] scale The scale for each output channel. +/// @param[out] rhs_packed The packed RHS matrix. +/// @param[in] extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix. +/// @param[in] params Parameters for the micro-kernel. void kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0( size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, const uint8_t* rhs, const int32_t* bias, const float* scale, void* rhs_packed, size_t extra_bytes, -- GitLab