From ba0aaaca1b0c713c6d53398fca11313104714699 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 31 May 2024 10:51:21 +0100 Subject: [PATCH] Update README.md and fix int4 matmul example Signed-off-by: Gian Marco Iodice --- README.md | 23 ++++++++-- .../CMakeLists.txt | 42 +++++++++---------- .../matmul_clamp_f32_qai8dxp_qsi4cxp.cpp | 6 +-- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 3dd80cc4..eceab58a 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,11 @@ Some of the key features of KleidiAI are the following:

Filename convention

-The `src/` directory is the home for all micro-kernels. The micro-kernels are grouped in separate directories based on the performed operation. For example, all the matrix-multiplication micro-kernels are held in the `matmul/` operator directory. +The `kai/ukernels` directory is the home for all micro-kernels. The micro-kernels are grouped in separate directories based on the performed operation. For example, all the matrix-multiplication micro-kernels are held in the `matmul/` operator directory. Inside the operator directory, you can find: -- *The common micro-kernels*, which are helper micro-kernels necessary for the correct functioning of the main ones. For example, some of these may be required for packing the input tensors. +- *The common micro-kernels*, which are helper micro-kernels necessary for the correct functioning of the main ones. For example, some of these may be required for packing the input tensors and held in the `pack` subdirectory. - *The micro-kernels* files, which are held in separate sub-directories. The name of the micro-kernel folder provides the description of the operation performed and the data type of the destination and source tensors. The general syntax for the micro-kernel folder is as follows: @@ -100,10 +100,11 @@ Some of the data types currently supported with the KleidiAI library are the fol | Data type | Abbreviation | Notes | | ----------- | ----------- | ----------- | | Floating-point 32-bit | f32 | | +| Floating-point 16-bit | f16 | | | Quantized (q) Symmetric (s) Signed (u) 4-bit (4) Per-Channel (cx) quantization parameters | qsi4cx | An fp32 multiplier shared among all values of the same channel. `x` denotes the entirety of the channel | | Quantized (q) Asymmetric (a) Signed (i) 8-bit (8) Per-Dimension (dx) (for example, Per-Row) quantization parameters | qai8dx | An fp32 multiplier and a int32 zero offset shared among all values of the same dimension. 
| -> ℹ️ In some cases, we may append the letter `p` to the data type to specify that the tensor is expected to be packed. A packed tensor is a tensor that has been rearranged in our preferred data layout from the original data layout to improve the performance of the micro-kernel. In addition to the letter `p`, we may append other alphanumerical values to specify the attributes of the data packing (for example, the block packing size). +> ℹ️ In some cases, we may append the letter `p` to the data type to specify that the tensor is expected to be packed. A packed tensor is a tensor that has been rearranged in our preferred data layout from the original data layout to improve the performance of the micro-kernel. In addition to the letter `p`, we may append other alphanumerical values to specify the attributes of the data packing (for example, the block packing size or the data type of the additional packed arguments).

Supported micro-kernels

@@ -131,6 +132,22 @@ Some of the data types currently supported with the KleidiAI library are the fol Since the RHS matrix often contains constant values, we recommend packing the RHS matrix only once and freeing the content of the original RHS matrix.
+ + Matrix-multiplication with RHS packed + matmul_clamp_f16_f16_f16p + + LHS: f16
+ RHS: f16p
+ DST: f16
+ + + TensorFlow Lite
+ + + The packing function for the RHS matrix is available in the `kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c/.h` files.
+ Since the RHS matrix often contains constant values, we recommend packing the RHS matrix only once and freeing the content of the original RHS matrix.
+ + Dynamic quantization and LHS matrix packing kai_lhs_quant_pack_qai8dxp_f32 diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt index ccc8b840..cf80906e 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/CMakeLists.txt @@ -8,31 +8,31 @@ cmake_minimum_required(VERSION 3.16) # KleidiAI include directories include_directories( - ../../src/ - ../../src/matmul/ - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/) + ../../ + ../../kai/ukernels/matmul/pack/ + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") # Files requires to build the executable add_executable(matmul_clamp_f32_qai8dxp_qsi4cxp matmul_clamp_f32_qai8dxp_qsi4cxp.cpp - ../../src/kai_common.h - ../../src/matmul/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h - ../../src/matmul/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.c - ../../src/matmul/kai_lhs_quant_pack_qai8dxp_f32.h - ../../src/matmul/kai_lhs_quant_pack_qai8dxp_f32.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h 
- ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h - ../../src/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c) + ../../kai/kai_common.h + ../../kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.h + ../../kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4cxp_qsu4cxs1s0.c + ../../kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h + ../../kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp_qsi4cxp_interface.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.c + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.c + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c + 
../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.h + ../../kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c) diff --git a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp index 0f404cf0..bde6447c 100644 --- a/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp +++ b/examples/matmul_clamp_f32_qai8dxp_qsi4cxp/matmul_clamp_f32_qai8dxp_qsi4cxp.cpp @@ -4,7 +4,6 @@ // SPDX-License-Identifier: Apache-2.0 // -// Include micro-kernel variants #include #include #include @@ -12,6 +11,7 @@ #include #include +// Include micro-kernel variants #include "kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp4x8_1x4x32_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod.h" @@ -351,8 +351,8 @@ int main(int argc, char** argv) { // Memory sizes for the reference implementation // After dynamically quantized the LHS matrix, we have the scale and offset for each // row. The scale (f32) and offset (int32) are stored at the beginning of each row - size_t lhs_ref_size_qa8dx = m * (k + sizeof(int32_t) + sizeof(float)); - size_t dst_ref_size_f32 = m * n * sizeof(float); + const size_t lhs_ref_size_qa8dx = m * (k + sizeof(int32_t) + sizeof(float)); + const size_t dst_ref_size_f32 = m * n * sizeof(float); uint8_t* lhs_ref_mtx_qa8dx = new uint8_t[lhs_ref_size_qa8dx]; uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; -- GitLab