From cb596442694dcad5911ac7d27e58e39b2000f8e2 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Fri, 31 May 2024 16:12:10 +0100
Subject: [PATCH] Fix doxygen format

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
---
 ..._f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h |   6 +
 ...ai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h | 181 ++++++++----------
 ...ai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h | 181 ++++++++----------
 ...2_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h | 181 ++++++++----------
 ...2_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h | 181 ++++++++----------
 ...2_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h | 181 ++++++++----------
 ...2_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h | 181 ++++++++----------
 .../pack/kai_lhs_quant_pack_qai8dxp_f32.h     | 106 +++++-----
 .../kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h     | 120 ++++++------
 9 files changed, 613 insertions(+), 705 deletions(-)

diff --git a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h
index d98e5442..4d668a09 100644
--- a/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h
+++ b/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h
@@ -17,6 +17,12 @@
 extern "C" {
 #endif  // __cplusplus
 
+/// Micro-kernel dependencies
+///
+/// -# kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon to pack the RHS matrix
+
+/// --------------------------------------------------
+
 /// Gets m step value.
 ///
 /// The starting row index must be divisible by `m_step`.
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h
index 7334e72f..98dc5880 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed).
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed).
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed).
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed).
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix.
- * @param[in] n_idx      Column index in the DST matrix. It must be multiple of 4.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 4.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the destination offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix
- * @param[in] n Number of columns in the destination (DST) matrix
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix
+/// @param[in] n Number of columns in the destination (DST) matrix
+///
+/// @return the DST size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
- * Output tile: (rows x cols) = 1 x 4
- * Accumulation performed in a single for loop: 64
- * Instruction used: dotprod
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
+/// Output tile: (rows x cols) = 1 x 4
+/// Accumulation performed in a single for loop: 64
+/// Instruction used: dotprod
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
+/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            Result of the vector-by-matrix
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h
index 69a16a3b..b7810898 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed).
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed).
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed).
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed).
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix.
- * @param[in] n_idx      Column index in the DST matrix. It must be multiple of 8.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 8.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix
- * @param[in] n Number of columns in the destination (DST) matrix
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix
+/// @param[in] n Number of columns in the destination (DST) matrix
+///
+/// @return the destination size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
- * Output tile: (rows x cols) = 1 x 8
- * Accumulation performed in a single for loop: 64
- * Instruction used: dotprod
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
+/// Output tile: (rows x cols) = 1 x 8
+/// Accumulation performed in a single for loop: 64
+/// Instruction used: dotprod
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
+/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            Result of the vector-by-matrix
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h
index 8592d285..aec1ca0e 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed).
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed).
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4.
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 4.
- * @param[in] n_idx      Column index in the DST matrix. It must be multiple of 4.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 4.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 4.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix.
- * @param[in] n Number of columns in the destination (DST) matrix.
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix.
+/// @param[in] n Number of columns in the destination (DST) matrix.
+///
+/// @return the destination size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
- * Output tile: (rows x cols) = 4 x 4
- * Accumulation performed in a single for loop: 32
- * Instruction used: i8mm
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
+/// Output tile: (rows x cols) = 4 x 4
+/// Accumulation performed in a single for loop: 32
+/// Instruction used: i8mm
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
+/// kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            Result of the vector-by-matrix
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h
index c22b4d71..bc277b12 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4.
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsi4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 4.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 8.
- * @param[in] n_idx      Column index in the DST matrix. It must be multiple of 4.
- * @param[in] dst_stride  The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 8.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 4.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix.
- * @param[in] n Number of columns in the destination (DST) matrix.
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix.
+/// @param[in] n Number of columns in the destination (DST) matrix.
+///
+/// @return the destination size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
- * Output tile: (rows x cols) = 8 x 4
- * Accumulation performed in a single for loop: 32
- * Instruction used: i8mm
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed.
+/// Output tile: (rows x cols) = 8 x 4
+/// Accumulation performed in a single for loop: 32
+/// Instruction used: i8mm
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling
+///                            @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            The DST matrix, which receives the result of the matrix multiplication.
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h
index 94c5f6c1..b9f06ea2 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 4.
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 4.
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8.
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsi4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 4.
- * @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 8.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 4.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 8.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix.
- * @param[in] n Number of columns in the destination (DST) matrix.
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix.
+/// @param[in] n Number of columns in the destination (DST) matrix.
+///
+/// @return the destination size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsu4cx) and packed.
- * Output tile: (rows x cols) = 4 x 8
- * Accumulation performed in a single for loop: 32
- * Instruction used: i8mm
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed.
+/// Output tile: (rows x cols) = 4 x 8
+/// Accumulation performed in a single for loop: 32
+/// Instruction used: i8mm
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling
+///                            @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            The DST matrix, which receives the result of the matrix multiplication.
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h
index 480f3c5f..7c289441 100644
--- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h
+++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h
@@ -14,124 +14,111 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @return the m step value
- */
+/// Micro-kernel dependencies
+///
+/// -# kai_lhs_quant_pack_qai8dxp_f32 to dynamically quantize and pack the LHS matrix
+/// -# kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 to pack the RHS matrix
+
+/// --------------------------------------------------
+
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @return the m step value
 size_t kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @return the n step
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @return the n step
 size_t kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the mr value, which must be used to pack the LHS matrix with
- *        the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
- *
- * @return the mr value
- */
+/// Gets the mr value, which must be used to pack the LHS matrix with
+/// the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel
+///
+/// @return the mr value
 size_t kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the nr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the nr value
- */
+/// Gets the nr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the nr value
 size_t kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the kr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the kr value
- */
+/// Gets the kr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the kr value
 size_t kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to get the sr value, which must be used to pack the RHS matrix with
- *        the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
- *
- * @return the sr value
- */
+/// Gets the sr value, which must be used to pack the RHS matrix with
+/// the @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0 micro-kernel
+///
+/// @return the sr value
 size_t kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(void);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed). It must be a multiple of 8
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8.
- * @param[in] k     The common dimension between the LHS and RHS matrix (K).
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsi4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of 8.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K).
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t n_idx, size_t k);
 
-/**
- * @brief Function to calculate the offset in bytes for the DST matrix
- *
- * @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 8.
- * @param[in] n_idx      Column index in the DST matrix. It must be multiple of 8.
- * @param[in] dst_stride The number of bytes in in each row of the DST matrix
- *
- * return the DST offset in bytes
- */
+/// Gets the offset in bytes for the DST matrix
+///
+/// @param[in] m_idx      Row index in the DST matrix. It must be a multiple of 8.
+/// @param[in] n_idx      Column index in the DST matrix. It must be a multiple of 8.
+/// @param[in] dst_stride The number of bytes in each row of the DST matrix
+///
+/// @return the DST offset in bytes
 size_t kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(
     size_t m_idx, size_t n_idx, size_t dst_stride);
 
-/**
- * @brief Function to query the size in bytes for the destination matrix.
- *
- * @param[in] m Number of rows in the destination (DST) matrix.
- * @param[in] n Number of columns in the destination (DST) matrix.
- */
+/// Gets the size in bytes for the destination matrix.
+///
+/// @param[in] m Number of rows in the destination (DST) matrix.
+/// @param[in] n Number of columns in the destination (DST) matrix.
+///
+/// @return the destination size in bytes
 size_t kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(size_t m, size_t n);
 
-/**
- * @brief Function to run the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
- *
- * LHS matrix: Signed 8-bit quantized asymmetric per-row (qau8dx) and packed
- * RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed.
- * Output tile: (rows x cols) = 8 x 8
- * Accumulation performed in a single for loop: 32
- * Instruction used: i8mm
- *
- * @param[in]  m              The number of output rows written.
- * @param[in]  n              The number of output columns written.
- * @param[in]  k              The number of channels. The common dimension of LHS & RHS.
- * @param[in]  lhs_packed     The LHS matrix packed.
- *                            When the activation are dynamically quantized, you can obtain this matrix
- *                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
- *                            both the dynamic quantization to 8-bit and activation packing in a single step.
- * @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling @ref
- * kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
- * @param[out] dst            Result of the vector-by-matrix
- * @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
- * @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
- * @param[in]  scalar_min     Min value used to clamp the final result.
- * @param[in]  scalar_max     Max value used to clamp the final result.
- */
+/// Runs the matrix multiplication (matmul) micro-kernel followed by a clamp (min-max) operation.
+///
+/// LHS matrix: Signed 8-bit quantized asymmetric per-row (qai8dx) and packed
+/// RHS matrix: Signed 4-bit quantized symmetric per-channel (qsi4cx) and packed.
+/// Output tile: (rows x cols) = 8 x 8
+/// Accumulation performed in a single for loop: 32
+/// Instruction used: i8mm
+///
+/// @param[in]  m              The number of output rows written.
+/// @param[in]  n              The number of output columns written.
+/// @param[in]  k              The number of channels. The common dimension of LHS & RHS.
+/// @param[in]  lhs_packed     The LHS matrix packed.
+///                            When the activations are dynamically quantized, you can obtain this matrix
+///                            by calling the @ref kai_lhs_quant_pack_qai8dxp_f32 micro-kernel which performs
+///                            both the dynamic quantization to 8-bit and activation packing in a single step.
+/// @param[in]  rhs_packed     The RHS matrix packed, which is obtained by calling
+///                            @ref kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0
+/// @param[out] dst            The DST matrix, which receives the result of the matrix multiplication.
+/// @param[in]  dst_stride_row Stride in bytes between two rows of the DST matrix.
+/// @param[in]  dst_stride_col Stride in bytes between two columns of the DST matrix. For now, it must be sizeof(float)
+/// @param[in]  scalar_min     Min value used to clamp the final result.
+/// @param[in]  scalar_max     Max value used to clamp the final result.
 void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm(
     size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, float* dst, size_t dst_stride_row,
     size_t dst_stride_col, float scalar_min, float scalar_max);
diff --git a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h
index 28bd1ac1..acba70cd 100644
--- a/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h
+++ b/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h
@@ -12,72 +12,62 @@
 extern "C" {
 #endif
 
-/**
- * @brief Function to get the m step value.
- *        The micro-kernel can process any M values. However, the starting M index to
- *        be processed must be a multiple of m step.
- *
- * @param[in] mr The number of M rows to interleave on the same output row.
- *
- * @return the m step value
- */
+/// Gets the m step value.
+/// The micro-kernel can process any M values. However, the starting M index to
+/// be processed must be a multiple of m step.
+///
+/// @param[in] mr The number of M rows to interleave on the same output row.
+///
+/// @return the m step value
 size_t kai_get_m_step_lhs_quant_pack_qai8dxp_f32(size_t mr);
 
-/**
- * @brief Function to calculate the offset in bytes for the LHS matrix (not packed)
- *
- * This function should be called before passing the pointer to the LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx      Row index in the LHS matrix (not packed).
- * @param[in] lhs_stride The number of bytes in in each row of the LHS matrix (not packed)
- *
- * return the offset in bytes to the LHS matrix
- */
+/// Gets the offset in bytes for the LHS matrix (not packed)
+///
+/// This function should be called before passing the pointer to the LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx      Row index in the LHS matrix (not packed).
+/// @param[in] lhs_stride The number of bytes in each row of the LHS matrix (not packed).
+///
+/// @return the offset in bytes to the LHS matrix
 size_t kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32(size_t m_idx, size_t lhs_stride);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed LHS matrix,
- *        which contains the packed 8-bit quantized asymmetric per-row (qa8dx) values.
- *
- * This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
- *
- * @param[in] m_idx Row index in the LHS matrix (not packed).
- * @param[in] k     Total number of columns in the LHS matrix (not packed).
- * @param[in] mr    The number of M rows to interleave on the same output row.
- * @param[in] kr    The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in] sr    The number of kr splits. It can be 1 (no splits) up to kr.
- *
- * return the offset in bytes to the packed LHS matrix
- */
+/// Gets the offset in bytes for the packed LHS matrix,
+/// which contains the packed 8-bit quantized asymmetric per-row (qai8dx) values.
+///
+/// This function should be called before passing the pointer to the packed LHS matrix to the micro-kernel.
+///
+/// @param[in] m_idx Row index in the LHS matrix (not packed).
+/// @param[in] k     Total number of columns in the LHS matrix (not packed).
+/// @param[in] mr    The number of M rows to interleave on the same output row.
+/// @param[in] kr    The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in] sr    The number of kr splits. It can be 1 (no splits) up to kr.
+///
+/// @return the offset in bytes to the packed LHS matrix
 size_t kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr);
 
-/**
- * @brief Function to return the memory required for storing the quantized and packed LHS matrix
- *
- * @param[in] m  Total number of rows in the LHS matrix (not packed).
- * @param[in] k  Total number of columns in the LHS matrix (not packed).
- * @param[in] mr The number of M rows to interleave on the same output row.
- * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr.
- *
- * return the size in bytes to the packed LHS matrix
- */
+/// Gets the size in bytes for the quantized and packed LHS matrix
+///
+/// @param[in] m  Total number of rows in the LHS matrix (not packed).
+/// @param[in] k  Total number of columns in the LHS matrix (not packed).
+/// @param[in] mr The number of M rows to interleave on the same output row.
+/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr.
+///
+/// @return the packed LHS matrix size in bytes
 size_t kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(size_t m, size_t k, size_t mr, size_t kr, size_t sr);
 
-/**
- * @brief Micro-kernel to quantize and pack the LHS matrix
- *
- * @param[in]  m           The number of output rows written.
- * @param[in]  k           The number of channels. The common dimension of LHS & RHS. It must be multiple of 8.
- * @param[in]  mr          The number of M rows to interleave on the same output row.
- * @param[in]  kr          The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in]  sr          The number of kr splits. It can be 1 (no splits) up to kr.
- *                         However, kr must be multiple of sr.
- * @param[in]  m_idx_start The starting M index.
- * @param[in]  lhs         LHS of the vector-by-matrix.
- * @param[in]  lhs_stride  Stride in bytes between two rows of LHS.
- * @param[out] lhs_packed  The quantized and packed LHS matrix.
- */
+/// Run the micro-kernel to quantize and pack the LHS matrix.
+///
+/// @param[in]  m           The number of output rows written.
+/// @param[in]  k           The number of channels. The common dimension of LHS & RHS. It must be multiple of 8.
+/// @param[in]  mr          The number of M rows to interleave on the same output row.
+/// @param[in]  kr          The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in]  sr          The number of kr splits. It can be 1 (no splits) up to kr.
+///                         However, kr must be multiple of sr.
+/// @param[in]  m_idx_start The starting M index.
+/// @param[in]  lhs         LHS of the vector-by-matrix.
+/// @param[in]  lhs_stride  Stride in bytes between two rows of LHS.
+/// @param[out] lhs_packed  The quantized and packed LHS matrix.
 void kai_run_lhs_quant_pack_qai8dxp_f32(
     size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs, size_t lhs_stride,
     void* lhs_packed);
diff --git a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h
index 1ce70c8b..bf947c1c 100644
--- a/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h
+++ b/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h
@@ -17,80 +17,70 @@ struct kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0_params {
     uint8_t rhs_zero_point;
 };
 
-/**
- * @brief Function to get the n step value.
- *        The micro-kernel can process any N values. However, the starting N index to
- *        be processed must be a multiple of n step.
- *
- * @param[in] nr The number of columns written by the matmul micro-kernel
- *
- * @return the n step value
- */
+/// Gets the n step value.
+/// The micro-kernel can process any N values. However, the starting N index to
+/// be processed must be a multiple of n step.
+///
+/// @param[in] nr The number of columns written by the matmul micro-kernel
+///
+/// @return the n step value
 size_t kai_get_n_step_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t nr);
 
-/**
- * @brief Function to calculate the offset in bytes for the RHS matrix (not packed), which holds
- *        the int4 values in a N x K matrix, where N is number of rows and K is the number of columns.
- *        Two int4 values are stored in one byte. The lower order part of the byte (low) holds
- *        the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1).
- *
- * @param[in] n_idx      Row index in the RHS matrix (not packed). It must be a multiple of n_step.
- * @param[in] rhs_stride The number of bytes in in each row of the RHS matrix (not packed)
- *
- * return the offset in bytes to the RHS matrix (not packed)
- */
+/// Gets the offset in bytes for the RHS matrix (not packed), which holds
+/// the int4 values in a N x K matrix, where N is number of rows and K is the number of columns.
+/// Two int4 values are stored in one byte. The lower order part of the byte (low) holds
+/// the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1).
+///
+/// @param[in] n_idx      Row index in the RHS matrix (not packed). It must be a multiple of n_step.
+/// @param[in] rhs_stride The number of bytes in each row of the RHS matrix (not packed).
+///
+/// @return the offset in bytes to the RHS matrix (not packed)
 size_t kai_get_rhs_offset_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t n_idx, size_t rhs_stride);
 
-/**
- * @brief Function to calculate the offset in bytes for the packed RHS matrix,
- *        which contains the packed 4-bit quantized symmetric per-channel (qsu4cx) values.
- *
- * @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step.
- * @param[in] k     The common dimension between the LHS and RHS matrix (K)
- * @param[in] nr    The number of columns written by the matmul micro-kernel
- * @param[in] kr    The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in] sr    The number of kr splits. It can be 1 (no splits) up to kr.
- *
- * return the offset in bytes to the packed RHS matrix
- */
+/// Gets the offset in bytes for the packed RHS matrix,
+/// which contains the packed 4-bit quantized symmetric per-channel (qsi4cx) values.
+///
+/// @param[in] n_idx Row index in the RHS matrix (not packed). It must be a multiple of n_step.
+/// @param[in] k     The common dimension between the LHS and RHS matrix (K)
+/// @param[in] nr    The number of columns written by the matmul micro-kernel
+/// @param[in] kr    The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in] sr    The number of kr splits. It can be 1 (no splits) up to kr.
+///
+/// @return the offset in bytes to the packed RHS matrix
 size_t kai_get_rhs_packed_offset_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(
     size_t n_idx, size_t k, size_t nr, size_t kr, size_t sr);
 
-/**
- * @brief Function to return the memory required for storing the packed RHS matrix
- *
- * @param[in] n The number of rows in the RHS matrix (not packed)
- * @param[in] k The number of columns in the RHS matrix (not packed).
- * @param[in] nr The number of columns written by the matmul micro-kernel
- * @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr.
- *
- * return the size in bytes to the packed RHS matrix
- */
+/// Gets the size in bytes for the packed RHS matrix.
+///
+/// @param[in] n The number of rows in the RHS matrix (not packed)
+/// @param[in] k The number of columns in the RHS matrix (not packed).
+/// @param[in] nr The number of columns written by the matmul micro-kernel
+/// @param[in] kr The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in] sr The number of kr splits. It can be 1 (no splits) up to kr.
+///
+/// @return the packed RHS matrix size in bytes
 size_t kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(size_t n, size_t k, size_t nr, size_t kr, size_t sr);
 
-/**
- * @brief Micro-kernel to pack the RHS matrix.
- *
- * @note  The int4 values are stored in a N x K matrix, where N is number of rows and K is the number of columns.
- *        Two int4 values are stored in one byte. The lower order part of the byte (low) holds
- *        the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1).
- *
- * @param[in]  num_groups  The number of groups. It must be 1.
- * @param[in]  n           The number of columns of the output matrix (N).
- * @param[in]  k           The common dimension between the LHS and RHS matrix (K). It must be an even value.
- * @param[in]  nr          The number of N columns to interleave on the same output output row.
- * @param[in]  kr          The number of columns loaded in the single inner most loop of the matmul micro-kernel.
- * @param[in]  sr          The number of kr splits. It can be 1 (no splits) up to kr.
- *                         However, kr must be multiple of sr.
- * @param[in]  rhs         The RHS matrix containing the 4-bit values.
- *                         Size in bytes is expected to be greater than or equal to n * k * (sizeof(uint8_t) / 2).
- * @param[in]  bias        The biases.
- * @param[in]  scale       The scale for each output channel.
- * @param[out] rhs_packed  The packed RHS matrix.
- * @param[in]  extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix.
- * @param[in]  params      Parameters for the micro-kernel.
- */
+/// Run the micro-kernel to pack the RHS matrix.
+///
+/// @note  The int4 values are stored in a N x K matrix, where N is number of rows and K is the number of columns.
+///        Two int4 values are stored in one byte. The lower order part of the byte (low) holds
+///        the first nibble (K-index + 0). The higher order of the byte holds the second nibble (K-index + 1).
+///
+/// @param[in]  num_groups  The number of groups. It must be 1.
+/// @param[in]  n           The number of columns of the output matrix (N).
+/// @param[in]  k           The common dimension between the LHS and RHS matrix (K). It must be an even value.
+/// @param[in]  nr          The number of N columns to interleave on the same output row.
+/// @param[in]  kr          The number of columns loaded in the single inner most loop of the matmul micro-kernel.
+/// @param[in]  sr          The number of kr splits. It can be 1 (no splits) up to kr.
+///                         However, kr must be multiple of sr.
+/// @param[in]  rhs         The RHS matrix containing the 4-bit values.
+///                         Size in bytes is expected to be greater than or equal to (n * k * sizeof(uint8_t)) / 2.
+/// @param[in]  bias        The biases.
+/// @param[in]  scale       The scale for each output channel.
+/// @param[out] rhs_packed  The packed RHS matrix.
+/// @param[in]  extra_bytes Extra bytes to append to the end of each row of the packed RHS matrix.
+/// @param[in]  params      Parameters for the micro-kernel.
 void kai_run_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0(
     size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, const uint8_t* rhs, const int32_t* bias,
     const float* scale, void* rhs_packed, size_t extra_bytes,
-- 
GitLab