From ecb33b7d48bd651186df7cca607c4489bd8e4c10 Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg <fredrik.svedberg@arm.com>
Date: Tue, 13 May 2025 14:03:00 +0200
Subject: [PATCH] MLBEDSW-9759 TOSA AvgPool Decomposition

* Added large dimensions decomposition
* Added large stride decomposition
* Implemented scaling for large kernels
  with padding and for large stride decomposition.

Change-Id: Ibcf5c4f75f063231ad6444cb21df47bdba78fd2d
Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
---
 .../architecture/architecture_constraints.hpp |   1 +
 .../ethosu55/ethos_u55_constraints.cpp        |   5 +
 .../ethosu85/ethos_u85_constraints.cpp        |  11 ++
 .../ethos_u85_register_cs_generator.cpp       |   2 +-
 ethosu/regor/common/box.hpp                   |  20 +++
 ethosu/regor/compiler/scheduler_decompose.cpp | 128 +++++++++++++++++-
 ethosu/regor/test/CMakeLists.txt              |   1 +
 ethosu/regor/test/test_box.cpp                |  96 +++++++++++++
 8 files changed, 262 insertions(+), 2 deletions(-)
 create mode 100644 ethosu/regor/test/test_box.cpp
diff --git a/ethosu/regor/architecture/architecture_constraints.hpp b/ethosu/regor/architecture/architecture_constraints.hpp
index 0a84117c..e5f5964c 100644
--- a/ethosu/regor/architecture/architecture_constraints.hpp
+++ b/ethosu/regor/architecture/architecture_constraints.hpp
@@ -77,6 +77,7 @@ enum class ArchProperty
     DepthMultiplier = 1 << 4,
     TransposeMask = 1 << 5,
     ReduceAxis = 1 << 6,
+    Scaling = 1 << 7,
 };
 
 struct ArchRequirements
diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
index 47817262..f203354f 100644
--- a/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu55/ethos_u55_constraints.cpp
@@ -572,6 +572,11 @@ Flags<QueryResult> EthosU55Constraints::OperatorQuery(OpType opType, const ArchO
             }
             result.Set(QueryResult::HasRequirements);
         }
+
+        if ( opType == OpType::AvgPool && (k->Size().x > 8 || k->Size().y > 8) && !k->Padding().IsZero() )
+        {
+            return QueryResult::Unsupported;
+        }
     }
     else
     {
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
index 79c1314a..d2de7f90 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_constraints.cpp
@@ -530,6 +530,17 @@ Flags<QueryResult> EthosU85Constraints::OperatorQuery(OpType opType, const ArchO
             }
             result.Set(QueryResult::HasRequirements);
         }
+
+        if ( opType == OpType::AvgPool && (k->Size().x > 8 || k->Size().y > 8) && !k->Padding().IsZero() &&
+             query->ofm.quantization.scales.size() )
+        {
+            if ( req )
+            {
+                req->req.Set(ArchRequirement::Decompose);
+                req->decomposeProps.Set(ArchProperty::Scaling);
+            }
+            result.Set(QueryResult::HasRequirements);
+        }
     }
 
     return result;
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
index c5d1fea8..322cc6c7 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp
@@ -717,7 +717,7 @@ void EthosU85RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp, bo
     QuantizedScale ofmScale(1, 0);
     pooling_mode mode = (poolOp->type == OpType::AvgPool && (poolOp->kernel.Size().x > 8 || poolOp->kernel.Size().y > 8)) ? pooling_mode::SUM : pooling_mode::NONE;
 
-    if ( mode == pooling_mode::SUM && useGlobalScale )
+    if ( mode == pooling_mode::SUM && useGlobalScale && poolOp->kernel.Padding().IsZero() )
     {
         uint32_t scale = 1;
         int shift = 0;
diff --git a/ethosu/regor/common/box.hpp b/ethosu/regor/common/box.hpp
index 84377977..60f2b29b 100644
--- a/ethosu/regor/common/box.hpp
+++ b/ethosu/regor/common/box.hpp
@@ -71,5 +71,31 @@ public:
         return true;
     }
 
+    // Translates the box by delta: both corners are shifted, so the size is unchanged.
+    // delta must have the same Size() as the start corner (asserted).
+    void Move(const Shape &delta)
+    {
+        assert(delta.Size() == _start.Size());
+        _start += delta;
+        _end += delta;
+    }
+
+    // Repositions the box so that its start corner equals start, keeping its size.
+    void MoveTo(const Shape &start)
+    {
+        assert(start.Size() == _start.Size());
+        Move(start - _start);
+    }
+
+    // Returns the region common to this box and other, or a default-constructed Box
+    // when Overlaps() reports none. Declared const: neither box is modified.
+    Box Intersection(const Box &other) const
+    {
+        return Overlaps(other) ? Box(Shape::Max(_start, other._start), Shape::Min(_end, other._end)) : Box{};
+    }
+
+    // Boxes compare equal when both their start and end corners match.
+    bool operator==(const Box &other) const { return (_start == other._start) && (_end == other._end); }
+
     std::string ToString() const { return fmt::format("[{} - {}]", _start.ToString(), _end.ToString()); }
 };
diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index fce619f6..fe612ec2 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -20,6 +20,8 @@
 
 #include "common/logging.hpp"
 
+#include "architecture/ethos_u_scaling.hpp"
+#include "common/box.hpp"
 #include "shape_util.hpp"
 
 #include <numeric>
@@ -2290,7 +2292,131 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeAvgPool(Architecture *
         result.emplace_back(std::move(op));
         return result;
     }
-    // Decomposition for large dimensions & strides is needed here.
+
+    ArchRequirements req{};
+    Flags<QueryResult> qResult = OperatorQuery(arch, op.get(), &req);
+
+    // Perform scaling of the output if needed
+    const int scaleSize = ofmConn->quantization.scales.size();
+    if ( qResult.Any(QueryResult::HasRequirements) && req.decomposeProps.Any(ArchProperty::Scaling, ArchProperty::KernelStride) && scaleSize )
+    {
+        // Create scaling array
+        int H = (kernel->Padding().Top() || kernel->Padding().Bottom()) ? ofmShape.Height() : 1;
+        int W = (kernel->Padding().Left() || kernel->Padding().Right()) ? ofmShape.Width() : 1;
+        int C = scaleSize > 1 ? ofmShape.Depth() : 1;
+
+        // Create SchedulerTensor for scales and shifts
+        auto shape = Shape(1, H, W, C);
+        auto scaleTensor = std::make_shared<SchedulerTensor>();
+        scaleTensor->uid = GenerateUniqueId();
+        scaleTensor->memArea = arch->ReadonlyMemory();
+        scaleTensor->dataType = DataType::Int32;
+        scaleTensor->storageShape = shape;
+        auto shiftTensor = scaleTensor->Clone();
+
+        // Create buffers that will hold scales and shifts
+        const auto size = shape.Elements();
+        auto scaleBuffer = std::make_unique<int32_t[]>(size);
+        auto shiftBuffer = std::make_unique<int32_t[]>(size);
+
+        // Calculate scales and shifts
+        auto ifmBox = Box(Shape{ifmShape.Height(), ifmShape.Width()});
+        auto kernelBox = Box(Shape{kernel->Size().y, kernel->Size().x});
+        const auto &ifmScales = ifmConn->quantization.scales;
+        const auto &ofmScales = ofmConn->quantization.scales;
+        int pos = 0;
+        for ( int y = 0; y < H; y++ )
+        {
+            int iy = y * kernel->Stride().y - padding.Top();
+            for ( int x = 0; x < W; x++ )
+            {
+                int ix = x * kernel->Stride().x - padding.Left();
+                kernelBox.MoveTo(Shape{iy, ix});
+                assert(ifmBox.Overlaps(kernelBox));
+                int elements = ifmBox.Intersection(kernelBox).SizeShape().Elements();
+                assert(elements);
+                for ( int c = 0; c < C; c++ )
+                {
+                    double ifmScale = float(ifmScales[(c + ifmSlice.offset.Depth()) % ifmScales.size()].Dequantize());
+                    double ofmScale = float(ofmScales[(c + ofmSlice.offset.Depth()) % ofmScales.size()].Dequantize());
+                    double rescale = ifmScale / ofmScale;
+                    // When there is only one kernel element and no rescale
+                    // the effective shift will be zero and no rounding will be performed
+                    // by the ASR, hence the scale needs to be initialized as below
+                    uint32_t scale = 1 << 30;
+                    int shift = 30;
+                    if ( !(elements == 1 && rescale == 1.0) )
+                        QuantizePoolingScale(elements, rescale, 0, scale, shift, 31);
+                    scaleBuffer[pos] = scale;
+                    shiftBuffer[pos] = shift - 30;
+                    pos++;
+                }
+            }
+        }
+
+        // Hand over buffers to the scale and shift tensors
+        scaleTensor->bufferView = BufferView(std::make_shared<Buffer>(std::move(scaleBuffer), size), 0, 8 * sizeof(int32_t), shape, {});
+        shiftTensor->bufferView = BufferView(std::make_shared<Buffer>(std::move(shiftBuffer), size), 0, 8 * sizeof(int32_t), shape, {});
+
+        // Setup intermediate tensors for scaling
+        auto mulIfm = std::make_shared<SchedulerTensor>();
+        mulIfm->uid = GenerateUniqueId();
+        mulIfm->memArea = arch->FeatureMapMemory();
+        mulIfm->dataType = DataType::Int32;
+        mulIfm->storageShape = ofmShape;
+        auto mulOfm = mulIfm->Clone();
+
+        // Apply scales
+        auto mul = std::make_unique<SchedulerOperation>(OpType::Mul);
+        mul->ConnectInput(TensorUsage::IFM0, mulIfm)->shape = mulIfm->storageShape;
+        mul->ConnectInput(TensorUsage::IFM1, scaleTensor)->shape = scaleTensor->storageShape;
+        auto mulOfmConn = mul->ConnectOutput(TensorUsage::OFM, mulOfm);
+        mulOfmConn->shape = mulOfm->storageShape;
+        mulOfmConn->quantization.scales.emplace_back(QuantizedScale{1, 30});
+        mulOfmConn->rounding = RoundMode::TRUNCATE_TO_LOWER;
+        auto mulOps = DecomposeElementwise(arch, std::move(mul));
+
+        // Apply shift
+        auto asr = std::make_unique<SchedulerOperation>(OpType::Asr);
+        asr->ConnectInput(TensorUsage::IFM0, mulOfm)->shape = mulOfm->storageShape;
+        asr->ConnectInput(TensorUsage::IFM1, shiftTensor)->shape = shiftTensor->storageShape;
+        *asr->ConnectOutput(TensorUsage::OFM, ofmConn->tensor) = *ofmConn;
+        asr->OFM()->quantization.scales = {QuantizedScale::Unit()};
+        auto asrOps = DecomposeElementwise(arch, std::move(asr));
+
+        // Redirect ofm to perform scaling and set unit scaling
+        ofmConn = op->ConnectOutput(TensorUsage::OFM, mulIfm);
+        ofmConn->quantization = Quantization::Unit();
+        // Remove scales to signal scaling is done elsewhere, i.e. with the MUL and ASR above
+        ofmConn->quantization.scales.clear();
+        ofmConn->SetType(DataType::None);  // Reset any data type on the connection, since the tensor has been replaced
+        auto subOps = DecomposeAvgPool(arch, std::move(op));
+        result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end()));
+        result.insert(result.end(), std::make_move_iterator(mulOps.begin()), std::make_move_iterator(mulOps.end()));
+        result.insert(result.end(), std::make_move_iterator(asrOps.begin()), std::make_move_iterator(asrOps.end()));
+        return result;
+    }
+
+    // Decomposition for large dimensions
+    try
+    {
+        if ( auto newBlockShape = NewOfmBlockShape(arch, op.get()) )
+        {
+            return DecomposeBlocks(arch, std::move(op), newBlockShape, DecomposeAvgPool);
+        }
+    }
+    catch ( const DecompositionFailure & )
+    {
+        UpdatePaddingAndIfmOffset(op.get());
+        result.emplace_back(std::move(op));
+        return result;
+    }
+    // Decomposition of large stride
+    if ( arch->Constraints()->SupportsAccumulatorSaveRestore() && req.decomposeProps.Any(ArchProperty::KernelStride) )
+    {
+        return DecomposeForStrides(arch, std::move(op), DecomposeAvgPool);
+    }
+
     // If we get here, decomposition has failed, the resulting operations will be executed on CPU
     UpdatePaddingAndIfmOffset(op.get());
     result.emplace_back(std::move(op));
diff --git a/ethosu/regor/test/CMakeLists.txt b/ethosu/regor/test/CMakeLists.txt
index a9cb9fd0..69e38340 100644
--- a/ethosu/regor/test/CMakeLists.txt
+++ b/ethosu/regor/test/CMakeLists.txt
@@ -64,6 +64,7 @@ add_catch_test(
         test_custom_operator_ethosu.cpp
         test_tflite_supported_operators.cpp
         test_passthrough.cpp
+        test_box.cpp
     DEPS
         test_common
 )
diff --git a/ethosu/regor/test/test_box.cpp b/ethosu/regor/test/test_box.cpp
new file mode 100644
index 00000000..9299d188
--- /dev/null
+++ b/ethosu/regor/test/test_box.cpp
@@ -0,0 +1,96 @@
+//
+// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common/box.hpp"
+
+#include <catch_all.hpp>
+
+// Streams a Shape as its ToString() text (presumably so Catch2 can print Shapes in failure output).
+static std::ostream &operator<<(std::ostream &os, const Shape &item)
+{
+    return os << item.ToString();
+}
+
+// Streams a Box as its ToString() text (presumably so Catch2 can print Boxes in failure output).
+static std::ostream &operator<<(std::ostream &os, const Box &item)
+{
+    return os << item.ToString();
+}
+
+TEST_CASE("Box: tests")  // the body is re-entered for each SECTION, so every section starts from fresh boxes
+{
+    Box outer({0, 0}, {3, 5});
+    Box inner({0, 0}, {2, 2});
+
+    SECTION("Construct")  // (start, end) construction
+    {
+        REQUIRE(outer.Start() == Shape{0, 0});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+        REQUIRE(inner.Start() == Shape{0, 0});
+        REQUIRE(inner.SizeShape() == Shape{2, 2});
+    }
+
+    SECTION("Move")  // relative translation keeps the size
+    {
+        outer.Move(Shape{-1, 1});
+        REQUIRE(outer.Start() == Shape{-1, 1});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+        outer.Move(Shape{1, 0});
+        REQUIRE(outer.Start() == Shape{0, 1});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+    }
+
+    SECTION("MoveTo")  // absolute placement keeps the size
+    {
+        outer.MoveTo(Shape{5, 6});
+        REQUIRE(outer.Start() == Shape{5, 6});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+        outer.MoveTo(Shape{-3, 7});
+        REQUIRE(outer.Start() == Shape{-3, 7});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+        outer.MoveTo(Shape{0, 0});
+        REQUIRE(outer.Start() == Shape{0, 0});
+        REQUIRE(outer.SizeShape() == Shape{3, 5});
+    }
+
+    SECTION("Overlaps")  // boxes that only touch at a corner do not overlap
+    {
+        REQUIRE(outer.Overlaps(inner));
+        inner.MoveTo({3, 5});
+        REQUIRE_FALSE(outer.Overlaps(inner));
+        inner.Move({-1, -1});
+        REQUIRE(outer.Overlaps(inner));
+        inner.MoveTo({-2, -2});
+        REQUIRE_FALSE(outer.Overlaps(inner));
+        inner.Move({1, 1});
+        REQUIRE(outer.Overlaps(inner));
+    }
+
+    SECTION("Intersection")  // disjoint boxes give a default-constructed Box
+    {
+        REQUIRE(outer.Intersection(inner) == Box({0, 0}, Box::Size({2, 2})));
+        inner.MoveTo({3, 5});
+        REQUIRE(outer.Intersection(inner) == Box{});
+        inner.Move({-1, -1});
+        REQUIRE(outer.Intersection(inner) == Box({2, 4}, Box::Size({1, 1})));
+        inner.MoveTo({-1, -1});
+        REQUIRE(outer.Intersection(inner) == Box({0, 0}, Box::Size({1, 1})));
+        inner.Move({2, 2});
+        REQUIRE(outer.Intersection(inner) == Box({1, 1}, Box::Size({2, 2})));
+    }
+}
-- 
GitLab