From ecb33b7d48bd651186df7cca607c4489bd8e4c10 Mon Sep 17 00:00:00 2001 From: Fredrik Svedberg Date: Tue, 13 May 2025 14:03:00 +0200 Subject: [PATCH] MLBEDSW-9759 TOSA AvgPool Decomposition * Added large dimensions decomposition * Added large stride decomposition * Implemented scaling for large kernels with padding and for large stride decomposition. Change-Id: Ibcf5c4f75f063231ad6444cb21df47bdba78fd2d Signed-off-by: Fredrik Svedberg --- .../architecture/architecture_constraints.hpp | 1 + .../ethosu55/ethos_u55_constraints.cpp | 5 + .../ethosu85/ethos_u85_constraints.cpp | 11 ++ .../ethos_u85_register_cs_generator.cpp | 2 +- ethosu/regor/common/box.hpp | 20 +++ ethosu/regor/compiler/scheduler_decompose.cpp | 128 +++++++++++++++++- ethosu/regor/test/CMakeLists.txt | 1 + ethosu/regor/test/test_box.cpp | 96 +++++++++++++ 8 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 ethosu/regor/test/test_box.cpp diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp index 0a84117c..e5f5964c 100644 --- a/ethosu/regor/architecture/architecture_constraints.hpp +++ b/ethosu/regor/architecture/architecture_constraints.hpp @@ -77,6 +77,7 @@ enum class ArchProperty DepthMultiplier = 1 << 4, TransposeMask = 1 << 5, ReduceAxis = 1 << 6, + Scaling = 1 << 7, }; struct ArchRequirements diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp index 47817262..f203354f 100644 --- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp @@ -572,6 +572,11 @@ Flags EthosU55Constraints::OperatorQuery(OpType opType, const ArchO } result.Set(QueryResult::HasRequirements); } + + if ( opType == OpType::AvgPool && (k->Size().x > 8 || k->Size().y > 8) && !k->Padding().IsZero() ) + { + return QueryResult::Unsupported; + } } else { diff --git 
a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp index 79c1314a..d2de7f90 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp @@ -530,6 +530,17 @@ Flags EthosU85Constraints::OperatorQuery(OpType opType, const ArchO } result.Set(QueryResult::HasRequirements); } + + if ( opType == OpType::AvgPool && (k->Size().x > 8 || k->Size().y > 8) && !k->Padding().IsZero() && + query->ofm.quantization.scales.size() ) + { + if ( req ) + { + req->req.Set(ArchRequirement::Decompose); + req->decomposeProps.Set(ArchProperty::Scaling); + } + result.Set(QueryResult::HasRequirements); + } } return result; diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp index c5d1fea8..322cc6c7 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp @@ -717,7 +717,7 @@ void EthosU85RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp, bo QuantizedScale ofmScale(1, 0); pooling_mode mode = (poolOp->type == OpType::AvgPool && (poolOp->kernel.Size().x > 8 || poolOp->kernel.Size().y > 8)) ? 
pooling_mode::SUM : pooling_mode::NONE; - if ( mode == pooling_mode::SUM && useGlobalScale ) + if ( mode == pooling_mode::SUM && useGlobalScale && poolOp->kernel.Padding().IsZero() ) { uint32_t scale = 1; int shift = 0; diff --git a/ethosu/regor/common/box.hpp b/ethosu/regor/common/box.hpp index 84377977..60f2b29b 100644 --- a/ethosu/regor/common/box.hpp +++ b/ethosu/regor/common/box.hpp @@ -71,5 +71,25 @@ public: return true; } + void Move(const Shape &delta) + { + assert(delta.Size() == _start.Size()); + _start += delta; + _end += delta; + } + + void MoveTo(const Shape &start) + { + assert(start.Size() == _start.Size()); + Move(start - _start); + } + + Box Intersection(const Box &other) const + { + return Overlaps(other) ? Box(Shape::Max(_start, other._start), Shape::Min(_end, other._end)) : Box{}; + } + + bool operator==(const Box &other) const { return (_start == other._start) && (_end == other._end); } + std::string ToString() const { return fmt::format("[{} - {}]", _start.ToString(), _end.ToString()); } }; diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp index fce619f6..fe612ec2 100644 --- a/ethosu/regor/compiler/scheduler_decompose.cpp +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -20,6 +20,8 @@ #include "common/logging.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "common/box.hpp" #include "shape_util.hpp" #include @@ -2290,7 +2292,131 @@ std::vector> DecomposeAvgPool(Architecture * result.emplace_back(std::move(op)); return result; } - // Decomposition for large dimensions & strides is needed here. 
+ + ArchRequirements req{}; + Flags qResult = OperatorQuery(arch, op.get(), &req); + + // Perform scaling of the output if needed + const int scaleSize = ofmConn->quantization.scales.size(); + if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::Scaling, ArchProperty::KernelStride) && scaleSize ) + { + // Create scaling array + int H = (kernel->Padding().Top() || kernel->Padding().Bottom()) ? ofmShape.Height() : 1; + int W = (kernel->Padding().Left() || kernel->Padding().Right()) ? ofmShape.Width() : 1; + int C = scaleSize > 1 ? ofmShape.Depth() : 1; + + // Create SchedulerTensor for scales and shifts + auto shape = Shape(1, H, W, C); + auto scaleTensor = std::make_shared(); + scaleTensor->uid = GenerateUniqueId(); + scaleTensor->memArea = arch->ReadonlyMemory(); + scaleTensor->dataType = DataType::Int32; + scaleTensor->storageShape = shape; + auto shiftTensor = scaleTensor->Clone(); + + // Create buffers that will hold scales and shifts + const auto size = shape.Elements(); + auto scaleBuffer = std::make_unique(size); + auto shiftBuffer = std::make_unique(size); + + // Calculate scales and shifts + auto ifmBox = Box(Shape{ifmShape.Height(), ifmShape.Width()}); + auto kernelBox = Box(Shape{kernel->Size().y, kernel->Size().x}); + const auto &ifmScales = ifmConn->quantization.scales; + const auto &ofmScales = ofmConn->quantization.scales; + int pos = 0; + for ( int y = 0; y < H; y++ ) + { + int iy = y * kernel->Stride().y - padding.Top(); + for ( int x = 0; x < W; x++ ) + { + int ix = x * kernel->Stride().x - padding.Left(); + kernelBox.MoveTo(Shape{iy, ix}); + assert(ifmBox.Overlaps(kernelBox)); + int elements = ifmBox.Intersection(kernelBox).SizeShape().Elements(); + assert(elements); + for ( int c = 0; c < C; c++ ) + { + double ifmScale = float(ifmScales[(c + ifmSlice.offset.Depth()) % ifmScales.size()].Dequantize()); + double ofmScale = float(ofmScales[(c + ofmSlice.offset.Depth()) % ofmScales.size()].Dequantize()); + double 
rescale = ifmScale / ofmScale; + // When there is only one kernel element and no rescale + // the effective shift will be zero and no rounding will be performed + // by the ASR, hence the scale needs to be initialized as below + uint32_t scale = 1 << 30; + int shift = 30; + if ( !(elements == 1 && rescale == 1.0) ) + QuantizePoolingScale(elements, rescale, 0, scale, shift, 31); + scaleBuffer[pos] = scale; + shiftBuffer[pos] = shift - 30; + pos++; + } + } + } + + // Hand over buffers to the scale and shift tensors + scaleTensor->bufferView = BufferView(std::make_shared(std::move(scaleBuffer), size), 0, 8 * sizeof(int32_t), shape, {}); + shiftTensor->bufferView = BufferView(std::make_shared(std::move(shiftBuffer), size), 0, 8 * sizeof(int32_t), shape, {}); + + // Setup intermediate tensors for scaling + auto mulIfm = std::make_shared(); + mulIfm->uid = GenerateUniqueId(); + mulIfm->memArea = arch->FeatureMapMemory(); + mulIfm->dataType = DataType::Int32; + mulIfm->storageShape = ofmShape; + auto mulOfm = mulIfm->Clone(); + + // Apply scales + auto mul = std::make_unique(OpType::Mul); + mul->ConnectInput(TensorUsage::IFM0, mulIfm)->shape = mulIfm->storageShape; + mul->ConnectInput(TensorUsage::IFM1, scaleTensor)->shape = scaleTensor->storageShape; + auto mulOfmConn = mul->ConnectOutput(TensorUsage::OFM, mulOfm); + mulOfmConn->shape = mulOfm->storageShape; + mulOfmConn->quantization.scales.emplace_back(QuantizedScale{1, 30}); + mulOfmConn->rounding = RoundMode::TRUNCATE_TO_LOWER; + auto mulOps = DecomposeElementwise(arch, std::move(mul)); + + // Apply shift + auto asr = std::make_unique(OpType::Asr); + asr->ConnectInput(TensorUsage::IFM0, mulOfm)->shape = mulOfm->storageShape; + asr->ConnectInput(TensorUsage::IFM1, shiftTensor)->shape = shiftTensor->storageShape; + *asr->ConnectOutput(TensorUsage::OFM, ofmConn->tensor) = *ofmConn; + asr->OFM()->quantization.scales = {QuantizedScale::Unit()}; + auto asrOps = DecomposeElementwise(arch, std::move(asr)); + + // Redirect 
ofm to perform scaling and set unit scaling + ofmConn = op->ConnectOutput(TensorUsage::OFM, mulIfm); + ofmConn->quantization = Quantization::Unit(); + // Remove scales to signal scaling is done elsewhere, i.e. with the MUL and ASR above + ofmConn->quantization.scales.clear(); + ofmConn->SetType(DataType::None); // Reset any data type on the connection, since the tensor has been replaced + auto subOps = DecomposeAvgPool(arch, std::move(op)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + result.insert(result.end(), std::make_move_iterator(mulOps.begin()), std::make_move_iterator(mulOps.end())); + result.insert(result.end(), std::make_move_iterator(asrOps.begin()), std::make_move_iterator(asrOps.end())); + return result; + } + + // Decomposition for large dimensions + try + { + if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) ) + { + return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeAvgPool); + } + } + catch ( const DecompositionFailure & ) + { + UpdatePaddingAndIfmOffset(op.get()); + result.emplace_back(std::move(op)); + return result; + } + // Decomposition of large stride + if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && req.decomposeProps.Any(ArchProperty::KernelStride) ) + { + return DecomposeForStrides(arch, std::move(op), DecomposeAvgPool); + } + // If we get here, decomposition has failed, the resulting operations will be executed on CPU UpdatePaddingAndIfmOffset(op.get()); result.emplace_back(std::move(op)); diff --git a/ethosu/regor/test/CMakeLists.txt b/ethosu/regor/test/CMakeLists.txt index a9cb9fd0..69e38340 100644 --- a/ethosu/regor/test/CMakeLists.txt +++ b/ethosu/regor/test/CMakeLists.txt @@ -64,6 +64,7 @@ add_catch_test( test_custom_operator_ethosu.cpp test_tflite_supported_operators.cpp test_passthrough.cpp + test_box.cpp DEPS test_common ) diff --git a/ethosu/regor/test/test_box.cpp b/ethosu/regor/test/test_box.cpp new file mode 100644 index 
00000000..9299d188 --- /dev/null +++ b/ethosu/regor/test/test_box.cpp @@ -0,0 +1,96 @@ +// +// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/box.hpp" + +#include + +static std::ostream &operator<<(std::ostream &os, const Shape &item) +{ + os << item.ToString(); + return os; +} + +static std::ostream &operator<<(std::ostream &os, const Box &item) +{ + os << item.ToString(); + return os; +} + +TEST_CASE("Box: tests") +{ + Box a({0, 0}, {3, 5}); + Box b({0, 0}, {2, 2}); + + SECTION("Construct") + { + REQUIRE(a.Start() == Shape{0, 0}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + REQUIRE(b.Start() == Shape{0, 0}); + REQUIRE(b.SizeShape() == Shape{2, 2}); + } + + SECTION("Move") + { + a.Move(Shape{-1, 1}); + REQUIRE(a.Start() == Shape{-1, 1}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + a.Move(Shape{1, 0}); + REQUIRE(a.Start() == Shape{0, 1}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + } + + SECTION("MoveTo") + { + a.MoveTo(Shape{5, 6}); + REQUIRE(a.Start() == Shape{5, 6}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + a.MoveTo(Shape{-3, 7}); + REQUIRE(a.Start() == Shape{-3, 7}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + a.MoveTo(Shape{0, 0}); + REQUIRE(a.Start() == Shape{0, 0}); + REQUIRE(a.SizeShape() == Shape{3, 5}); + } + + SECTION("Overlaps") + { + REQUIRE(a.Overlaps(b)); + b.MoveTo({3, 5}); + REQUIRE(!a.Overlaps(b)); + b.Move({-1, 
-1}); + REQUIRE(a.Overlaps(b)); + b.MoveTo({-2, -2}); + REQUIRE(!a.Overlaps(b)); + b.Move({1, 1}); + REQUIRE(a.Overlaps(b)); + } + + SECTION("Intersection") + { + REQUIRE(a.Intersection(b) == Box({0, 0}, Box::Size({2, 2}))); + b.MoveTo({3, 5}); + REQUIRE(a.Intersection(b) == Box{}); + b.Move({-1, -1}); + REQUIRE(a.Intersection(b) == Box({2, 4}, Box::Size({1, 1}))); + b.MoveTo({-1, -1}); + REQUIRE(a.Intersection(b) == Box({0, 0}, Box::Size({1, 1}))); + b.Move({2, 2}); + REQUIRE(a.Intersection(b) == Box({1, 1}, Box::Size({2, 2}))); + } +} -- GitLab