From 20dfdd0b2db492687774236cddcf5884cf7d75b1 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Wed, 5 Mar 2025 12:47:17 +0000 Subject: [PATCH 1/2] MLBEDSW-10106: Update Ethos-U55 MatMul performance stats - Updated performance calculations for the Ethos-U55 MatMul implementation. This is required to maintain the Ethos-U55/Ethos-U85 abstraction (both must return a result) when using the performance interface. - Fixed incomplete implementation of encoded weights byte transfer values. - Replaced manual datatype-related scaling with the DataType scaling functions. Signed-off-by: Philip Hall Change-Id: I7c8deb4e2740518874530786481d4ef57822bac4 -- GitLab From 022fe9a6ab30271979e6872731c2166c4b8b7918 Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Tue, 11 Mar 2025 17:40:56 +0000 Subject: [PATCH 2/2] MLBEDSW-10531: Allow no-op transpose to use NHCWB16 Ethos-U55 manually copies no-op transposes using the DMA, which does not care about tensor format. - Prevent format asserts when a no-op transpose uses NHCWB16 format. - Run merge pass after reduction pass to improve removal of no-op transposes. 
Signed-off-by: Philip Hall Change-Id: I07e1a9098257e58757b46bb78b402b19f8b07310 --- .../ethosu55/ethos_u55_register_cs_generator.cpp | 14 +++++++++++--- ethosu/regor/compiler/graphir_optimiser.hpp | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index 098f9757..3bd11ed5 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -1302,8 +1302,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo auto &ofm = op->ofm; assert(op->subOps.empty()); - assert(ifm.format == TensorFormat::NHWC); - assert(ofm.format == TensorFormat::NHWC); + assert(ifm.dataType == ofm.dataType); assert(((ofm.transpose == TransposeType::NWHC) || !ifm.slice.shape || (ifm.shape == ifm.slice.shape)) && "Implementation cannot be sliced"); ifm.shape = Shape::PadAxes(ifm.shape, 4, 1); assert((ifm.shape.AxisProduct(0, ifm.shape.Size() - 3) <= 1) && "Batch transposes unsupported"); @@ -1316,10 +1315,16 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo if ( identity ) { LOG_WARN("RCS: Emitting no-op transpose as a memory copy\n"); + assert(ifm.format == ofm.format); auto dma = std::make_unique(); dma->srcMemArea = ifm.memArea; dma->srcAddress = ifm.address; - dma->length = DataTypeStorageSizeBytes(ifm.dataType, ifm.shape.Elements()); + int elements = ofm.shape.Elements(); + if ( ifm.format == TensorFormat::NHCWB16 ) + { + elements = (elements / ofm.shape.Depth()) * RoundAway(ofm.shape.Depth(), 16); + } + dma->length = DataTypeStorageSizeBytes(ofm.dataType, elements); dma->destMemArea = ofm.memArea; dma->destAddress = ofm.address; emitted.push_back(dma.get()); @@ -1327,6 +1332,9 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo } else { + 
assert(ifm.format == TensorFormat::NHWC); + assert(ofm.format == TensorFormat::NHWC); + // Strided output on AveragePool can swap Height/Width over any channel depth by // adjusting the output strides to place the channel arrays in the required layout. // diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp index e54590a9..0294ed01 100644 --- a/ethosu/regor/compiler/graphir_optimiser.hpp +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -149,7 +149,6 @@ private: &GraphIrOptimiser::RewriteDepthwise, &GraphIrOptimiser::RewriteTransposeConvOFMPadding, &GraphIrOptimiser::OptimiseElementwise, - &GraphIrOptimiser::MergeTransposes, &GraphIrOptimiser::RearrangeTranspose, &GraphIrOptimiser::ReshapeReverse, &GraphIrOptimiser::UnrollConv @@ -160,6 +159,7 @@ private: { {}, { + &GraphIrOptimiser::MergeTransposes, &GraphIrOptimiser::MoveSplitSliceToConsumer } }, -- GitLab