From 016c105f23bcefbf90765840e9e2d8b3746e8ca7 Mon Sep 17 00:00:00 2001
From: Philip Hall
Date: Wed, 5 Mar 2025 12:47:17 +0000
Subject: [PATCH 1/2] MLBEDSW-10106: Update Ethos-U55 MatMul performance stats

- Updated performance calculations for the Ethos-U55 MatMul
  implementation. This is required to maintain the
  Ethos-U55/Ethos-U85 abstraction (both must return a result) when
  using the performance interface.
- Fixed incomplete implementation of encoded weights byte transfer
  values.
- Replaced manual datatype related scaling to use the DataType scaling
  functions.

Signed-off-by: Philip Hall
Change-Id: I7c8deb4e2740518874530786481d4ef57822bac4
--
GitLab


From 1367029e25cfa2ac82ec838c6d7a2f983c14756f Mon Sep 17 00:00:00 2001
From: Philip Hall
Date: Thu, 13 Mar 2025 16:19:11 +0000
Subject: [PATCH 2/2] MLBEDSW-10106: Improve Ethos-U55 MatMul performance

Small MatMul operations suffer from internal dependency between the
mul and sum operations that reduces performance. This change ensures
that an ofm is divided into at least 2 blocks to reduce or remove that
dependency.

Signed-off-by: Philip Hall
Change-Id: I102cbb2c865c84ea6e1cf577bc174d0936898a30
---
 .../regor/architecture/ethosu55/ethos_u55.cpp | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
index d03ff1a2..3fea9e9f 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp
@@ -180,6 +180,20 @@ void ArchEthosU55::ApplyConfig(const AcceleratorConfig *cfg)
 
     _rcsGenerator = std::make_unique(this);
 }
+static Shape MatMulDependencyFit(const Shape &shape, int minSize, const Shape &blockLimit)
+{
+    // Attempt to fit multiple blocks in W/H to reduce block
+    // dependency stalls
+    int axis = (shape.Height() > blockLimit.Height()) ? -3 : -2;
+    if ( shape[axis] <= blockLimit[axis] )
+    {
+        for ( int divider = 3; divider > 1; divider-- )
+        {
+            if ( shape[axis] >= (minSize * divider) ) return shape.With(axis, DivRoundUp(shape[axis], divider));
+        }
+    }
+    return shape;
+}
 
 std::unique_ptr ArchEthosU55::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query)
 {
@@ -189,17 +203,19 @@ std::unique_ptr ArchEthosU55::GetOpConfig(OpType opType, c
         ArchitectureConfigQuery tmpQuery = query;
         Kernel unitKernel = Kernel::UnitKernel();
         int batches = query.ofmShape.Height();
+
         // Block configuration for the Elementwise Mul
         tmpQuery.kernel = &unitKernel;
         tmpQuery.ifmBits = query.ifmBits;
         tmpQuery.ifmShape[1] = Shape(1, batches, 1, query.ifmShape[1].Depth());
-        tmpQuery.ofmShape = query.ifmShape[0];
+        tmpQuery.ifmShape[0] = MatMulDependencyFit(query.ifmShape[0], 4, _ofmBlockMax);
+        tmpQuery.ofmShape = tmpQuery.ifmShape[0];
         tmpQuery.ofmFormat = TensorFormat::NHWC;
         tmpQuery.ofmBits = 32;
         tmpQuery.transpose = TransposeType::None;
         auto mulConfig = FindBlockConfig(OpType::Mul, tmpQuery);
         // Block configuration for the Reduced Sum
-        tmpQuery.ofmShape = Shape(1, batches, query.ifmShape[0].Width(), 1);
+        tmpQuery.ofmShape = MatMulDependencyFit(Shape(1, batches, query.ifmShape[0].Width(), 1), 4, _ofmBlockMax);
         tmpQuery.ofmBits = query.ofmBits;
         tmpQuery.ofmFormat = query.ofmFormat;
         auto reduceConfig = FindBlockConfig(OpType::ReduceSum, tmpQuery);
--
GitLab