From 016c105f23bcefbf90765840e9e2d8b3746e8ca7 Mon Sep 17 00:00:00 2001
From: Philip Hall <philip.hall@arm.com>
Date: Wed, 5 Mar 2025 12:47:17 +0000
Subject: [PATCH 1/2] MLBEDSW-10106: Update Ethos-U55 MatMul performance stats

 - Updated performance calculations for the Ethos-U55 MatMul
   implementation. This is required to maintain the
   Ethos-U55/Ethos-U85 abstraction (both must return a result)
   when using the performance interface.
 - Fixed incomplete implementation of encoded weights byte
   transfer values.
 - Replaced manual datatype-related scaling with the DataType
   scaling functions.

Signed-off-by: Philip Hall <philip.hall@arm.com>
Change-Id: I7c8deb4e2740518874530786481d4ef57822bac4
-- 
GitLab


From 1367029e25cfa2ac82ec838c6d7a2f983c14756f Mon Sep 17 00:00:00 2001
From: Philip Hall <philip.hall@arm.com>
Date: Thu, 13 Mar 2025 16:19:11 +0000
Subject: [PATCH 2/2] MLBEDSW-10106: Improve Ethos-U55 MatMul performance

Small MatMul operations suffer from an internal
dependency between the mul and sum operations
that reduces performance.
This change ensures that an OFM is divided into
at least 2 blocks to reduce or remove that
dependency.

Signed-off-by: Philip Hall <philip.hall@arm.com>
Change-Id: I102cbb2c865c84ea6e1cf577bc174d0936898a30
---
 .../regor/architecture/ethosu55/ethos_u55.cpp | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
index d03ff1a2..3fea9e9f 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
@@ -180,6 +180,20 @@ void ArchEthosU55::ApplyConfig(const AcceleratorConfig *cfg)
     _rcsGenerator = std::make_unique<EthosU55RCSGenerator>(this);
 }
 
+static Shape MatMulDependencyFit(const Shape &shape, int minSize, const Shape &blockLimit)
+{
+    // Attempt to fit multiple blocks in W/H to reduce block dependency stalls. For the first divider whose
+    // parts stay >= minSize, the axis is replaced by DivRoundUp(size, divider); otherwise shape is returned as is.
+    int axis = (shape.Height() > blockLimit.Height()) ? -3 : -2;  // presumably -3 = H, -2 = W; TODO confirm
+    if ( shape[axis] <= blockLimit[axis] )  // skipped when the axis already exceeds blockLimit
+    {
+        for ( int divider = 3; divider > 1; divider-- )  // try 3 parts first, then 2
+        {
+            if ( shape[axis] >= (minSize * divider) ) return shape.With(axis, DivRoundUp(shape[axis], divider));
+        }
+    }
+    return shape;
+}
 
 std::unique_ptr<ArchitectureOpConfig> ArchEthosU55::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query)
 {
@@ -189,17 +203,19 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU55::GetOpConfig(OpType opType, c
         ArchitectureConfigQuery tmpQuery = query;
         Kernel unitKernel = Kernel::UnitKernel();
         int batches = query.ofmShape.Height();
+
         // Block configuration for the Elementwise Mul
         tmpQuery.kernel = &unitKernel;
         tmpQuery.ifmBits = query.ifmBits;
         tmpQuery.ifmShape[1] = Shape(1, batches, 1, query.ifmShape[1].Depth());
-        tmpQuery.ofmShape = query.ifmShape[0];
+        tmpQuery.ifmShape[0] = MatMulDependencyFit(query.ifmShape[0], 4, _ofmBlockMax);
+        tmpQuery.ofmShape = tmpQuery.ifmShape[0];
         tmpQuery.ofmFormat = TensorFormat::NHWC;
         tmpQuery.ofmBits = 32;
         tmpQuery.transpose = TransposeType::None;
         auto mulConfig = FindBlockConfig(OpType::Mul, tmpQuery);
         // Block configuration for the Reduced Sum
-        tmpQuery.ofmShape = Shape(1, batches, query.ifmShape[0].Width(), 1);
+        tmpQuery.ofmShape = MatMulDependencyFit(Shape(1, batches, query.ifmShape[0].Width(), 1), 4, _ofmBlockMax);
         tmpQuery.ofmBits = query.ofmBits;
         tmpQuery.ofmFormat = query.ofmFormat;
         auto reduceConfig = FindBlockConfig(OpType::ReduceSum, tmpQuery);
-- 
GitLab