From 77b859a9df4b19d9292524e2f056870ea31c35de Mon Sep 17 00:00:00 2001 From: Philip Hall Date: Wed, 8 Jan 2025 10:17:42 +0000 Subject: [PATCH] MLBEDSW-10205: Fix Ethos-U55 issue with 32-bit copy - An identity transpose TOSA op was replaced with a memory copy for Ethos-U55, but the underlying AveragePool implementation could not copy the 32-bit tensors as defined. This commit re-frames the tensor memory as twice as many 16-bit elements. - Transposed tensors that exceed the Ethos-U55 HW size limits cannot be trivially decomposed due to using AveragePool for the underlying implementation. Added asserts to catch oversized transposes passing through the command generator. Signed-off-by: Philip Hall Change-Id: Ic80acd32156f8d2d3980e5e43d9344efbefb3494 --- .../ethos_u55_register_cs_generator.cpp | 56 +++++++++++++++---- ethosu/regor/common/box.hpp | 5 +- ethosu/regor/common/shape.hpp | 7 ++- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp index e7a378ee..0d6b3f8b 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -992,8 +992,12 @@ void EthosU55RCSGenerator::GenerateOFM(OpType opType, const HLCFeatureMap &fm, c Emit(isa::npu_set_ofm_base2_t(tiles.address[2])); Emit(isa::npu_set_ofm_base3_t(tiles.address[3])); // OFM size - Emit(isa::npu_set_ofm_height_m1_t(DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1)); - Emit(isa::npu_set_ofm_width_m1_t(DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1)); + unsigned heightM1 = DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1; + unsigned widthM1 = 
DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1; + assert(isa::npu_set_ofm_height_m1_t(heightM1).get_height_m1() == heightM1); + assert(isa::npu_set_ofm_width_m1_t(widthM1).get_width_m1() == widthM1); + Emit(isa::npu_set_ofm_height_m1_t(heightM1)); + Emit(isa::npu_set_ofm_width_m1_t(widthM1)); // Tile related registers Emit(isa::npu_set_ofm_height0_m1_t(tiles.height0 - 1)); Emit(isa::npu_set_ofm_height1_m1_t(tiles.height1 - 1)); @@ -1264,6 +1268,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo assert(ifm.format == TensorFormat::NHWC); assert(ofm.format == TensorFormat::NHWC); assert(ifm.shape.Size() <= 4); + assert(((ofm.transpose == TransposeType::NWHC) || !ifm.slice.shape || (ifm.shape == ifm.slice.shape)) && "Implementation cannot be sliced"); ifm.shape = Shape::PadAxes(ifm.shape, 4, 0); Shape outShape = ifm.shape.Permute(unsigned(ofm.transpose)); @@ -1308,6 +1313,8 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo if ( NonZeroNybbles(swapMask) == 2 ) { int elementSize = DataTypeSizeBits(ofm.dataType) / 8; + // Activation element size must be supported since EthosU55 ignores channel stride for NHWC format tensors + assert(elementSize <= 2); // Can only swap 2 axes at once using this method int from; @@ -1315,18 +1322,20 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo from = ifm.shape.Size() - 1 - from; to = ifm.shape.Size() - 1 - to; + // May be decomposing NWHC in depth + Shape sliceShape = ifm.slice.shape ? 
ifm.slice.shape : ifm.shape; + // Place the swappable axes in H/W (works in elements here) - outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], 1); int depth = 1, slices = 1; int ifmStep = 0; int ofmStep = 0; // Not all elements participate in the transposed axes - if ( outFM.shape.Elements() != ifm.shape.Elements() ) + if ( (sliceShape[from] * sliceShape[to]) != sliceShape.Elements() ) { if ( ofm.transpose == TransposeType::NWHC ) { - depth = ifm.shape.Depth(); + depth = sliceShape.Depth(); slices = 1; ifmStep = ofmStep = 0; assert(from == 1 && to == 2); @@ -1355,9 +1364,12 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo // Recalculate destination as same as source but with output different strides outFM.shape = Shape(1, ifm.shape[from], ifm.shape[to], depth * elementSize); inFM.shape = outFM.shape; - // Shapes are measured in terms of bytes, not elements. - outFM.dataType = DataType::Int8; - inFM.dataType = DataType::Int8; + // Input address (potential depth slices) + inFM.address = AddressForCoordinate(ifm, ifm.strides, ifm.slice.offset); + inFM.slice.offset = inFM.slice.offset.WithZeros(); + // Output address (potential depth slices) + outFM.address = AddressForCoordinate(ofm, ofm.strides, ofm.slice.offset); + outFM.slice.offset = outFM.slice.offset.WithZeros(); // Special case for IFM with sparse strides if ( (slices > 1) && (ofm.transpose == TransposeType::NCWH) ) @@ -1368,7 +1380,7 @@ void EthosU55RCSGenerator::InsertTransposeCommand(const HLCStripe *stripe, Tempo else { outFM.strides = Shape(1, elementSize * depth, elementSize * depth * outFM.shape.Height(), elementSize); - inFM.strides = Shape::GetStridesForShape(inFM.shape, 1); + inFM.strides = Shape::GetStridesForShape(inFM.shape, elementSize); } // Repeat the transpose at advancing offsets for each slice @@ -1525,6 +1537,8 @@ void EthosU55RCSGenerator::GeneratePoolingOp(const HLCStripe *stripe, MemoryAcce auto pad = stripe->padding; auto padSum = pad.top + 
pad.left + pad.bottom + pad.right; bool useGlobalScale = !op->scales; + HLCStripe modifiedStripe(nullptr); + if ( _arch->UseAvgPoolNop(op->type) ) { assert(op->kernel.Size() == Point2i(1, 1)); @@ -1532,6 +1546,28 @@ void EthosU55RCSGenerator::GeneratePoolingOp(const HLCStripe *stripe, MemoryAcce assert(op->kernel.Dilation() == Point2i(1, 1)); assert(op->kernel.DepthMultiplier() == 1); assert(useGlobalScale); + assert(op->ifm.size() > 0); + // Op is being used as a 32-bit unscaled memory copy but + // we do not support more than 16-bit activations so adjust + // the tensor types and strides. + if ( op->type == OpType::MemoryCopy && (op->ifm[0].dataType == op->ofm.dataType) && DataTypeSizeBits(op->ofm.dataType) == 32 ) + { + assert(op->ifm[0].format == TensorFormat::NHWC); + assert(op->ofm.format == TensorFormat::NHWC); + modifiedStripe = *stripe; + op->ifm[0].dataType = DataType::Int16; + op->ifm[0].shape[-1] *= 2; + op->ifm[0].strides[-1] /= 2; + modifiedStripe.ifmAreas[0].Start() = modifiedStripe.ifmAreas[0].Start() * Shape(2); + modifiedStripe.ifmAreas[0].End() = modifiedStripe.ifmAreas[0].End() * Shape(2); + + op->ofm.dataType = DataType::Int16; + op->ofm.shape[-1] *= 2; + op->ofm.strides[-1] /= 2; + modifiedStripe.ofmArea.Start() = modifiedStripe.ofmArea.Start() * Shape(2); + modifiedStripe.ofmArea.End() = modifiedStripe.ofmArea.End() * Shape(2); + stripe = &modifiedStripe; + } } GenerateCommon(stripe, useGlobalScale, RCSIfmScaleMode::OPA_OPB_16, memoryAccesses); GenerateOFMScalingForPooling(op, useGlobalScale); diff --git a/ethosu/regor/common/box.hpp b/ethosu/regor/common/box.hpp index f4c957d1..2e6eb729 100644 --- a/ethosu/regor/common/box.hpp +++ b/ethosu/regor/common/box.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -41,6 +41,9 @@ public: Box(const Shape &end) : 
Box(end.WithZeros(), end) {} + Shape &Start() { return _start; } + Shape &End() { return _end; } + const Shape &Start() const { return _start; } const Shape &End() const { return _end; } diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp index e6753274..1a2fa000 100644 --- a/ethosu/regor/common/shape.hpp +++ b/ethosu/regor/common/shape.hpp @@ -1,5 +1,5 @@ // -// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2021-2025 Arm Limited and/or its affiliates // // SPDX-License-Identifier: Apache-2.0 // @@ -228,6 +228,11 @@ public: Shape operator/(const Shape &other) const { return Shape::MaxFunc<std::divides<int>, false, 1>(*this, other); } + Shape operator*(const Shape &other) const + { + return Shape::MaxFunc<std::multiplies<int>, false, 1>(*this, other); + } + Shape operator*(int scale) const { return Shape::ScalarFunc<std::multiplies<int>>(*this, scale); } Shape operator/(int scale) const { return Shape::ScalarFunc<std::divides<int>>(*this, scale); } -- GitLab