From ab36eec4b03d9685a2e0b8c3a372db3abb27501e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Davidsson?=
Date: Wed, 20 Nov 2024 11:06:25 +0100
Subject: [PATCH] MLBEDSW-9759: Decompose batch for AvgPool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add decomposition for AvgPool, handling batch > 1.
- Convert padding to offsets for AvgPool and MaxPool decomposition.

Change-Id: I47faaaddedb0295abc084e3e966daa53817c2586
Signed-off-by: Björn Davidsson
---
 ethosu/regor/compiler/scheduler_decompose.cpp | 59 +++++++++++++++----
 ethosu/regor/compiler/scheduler_decompose.hpp |  1 +
 ethosu/regor/compiler/scheduler_packing.cpp   |  3 +
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp
index c3c74440..b16aa14e 100644
--- a/ethosu/regor/compiler/scheduler_decompose.cpp
+++ b/ethosu/regor/compiler/scheduler_decompose.cpp
@@ -245,6 +245,7 @@ bool CanDecompose(Architecture *, const SchedulerOperation *schedOp)
     if ( schedOp->Type() == OpType::ArgMax ) return true;
     if ( schedOp->Type() == OpType::Reverse ) return true;
     if ( schedOp->Type() == OpType::Transpose ) return true;
+    if ( schedOp->Type() == OpType::AvgPool ) return true;
     if ( schedOp->Type() == OpType::MaxPool ) return true;
     if ( schedOp->Type() == OpType::Resize ) return true;
     return false;
@@ -1812,33 +1813,65 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTranspose(Architecture
     return result;
 }
 
-std::vector<std::unique_ptr<SchedulerOperation>> DecomposeMaxPool(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeAvgPool(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
 {
     std::vector<std::unique_ptr<SchedulerOperation>> result;
-    auto ofmConn = op->Output(TensorUsage::OFM);
-    auto &ofmShape = ofmConn->SliceShape();
+    auto *ofmConn = op->Output(TensorUsage::OFM);
+    auto *ifmConn = op->Input(TensorUsage::IFM);
+    const auto &ofmShape = ofmConn->SliceShape();
+    const auto &ifmShape = ifmConn->SliceShape();
     auto &ofmSlice = ofmConn->slice;
-    auto ifmConn = op->Input(TensorUsage::IFM);
-    auto &ifmShape = ifmConn->SliceShape();
     auto &ifmSlice = ifmConn->slice;
-
+    auto *kernel = op->Kernel();
+    auto &padding = kernel->Padding();
     ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape);
-    ifmSlice.Initialize(ifmShape.WithZeros(), ifmShape);
-
-    if ( auto ifm2Conn = op->TryInput(TensorUsage::IFM1) )
+    ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape);
+    auto ofmRank = ofmShape.Size();
+    if ( ofmRank > 3 && (ofmShape.Elements() > ofmShape.Height() * ofmShape.Width() * ofmShape.Depth()) )
     {
-        auto &ifm2Shape = ifm2Conn->shape;
-        auto &ifm2Slice = ifm2Conn->slice;
+        return DecomposeLeadingDimensions(ofmRank - 3, arch, std::move(op), DecomposeAvgPool);
+    }
 
-        ifm2Slice.Initialize(ifm2Shape.WithZeros(), ifm2Shape);
+    if ( !NeedsDecompose(arch, op.get()) )
+    {
+        UpdatePaddingAndIfmOffset(op.get());
+        result.emplace_back(std::move(op));
+        return result;
     }
+    // Decomposition for large dimensions & strides is needed here.
+    // If we get here, decomposition has failed, the resulting operations will be executed on CPU
+    UpdatePaddingAndIfmOffset(op.get());
+    result.emplace_back(std::move(op));
+    return result;
+}
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeMaxPool(Architecture *arch, std::unique_ptr<SchedulerOperation> op)
+{
+    std::vector<std::unique_ptr<SchedulerOperation>> result;
+    auto *ofmConn = op->Output(TensorUsage::OFM);
+    auto *ifmConn = op->Input(TensorUsage::IFM);
+    const auto &ofmShape = ofmConn->SliceShape();
+    const auto &ifmShape = ifmConn->SliceShape();
+    auto &ofmSlice = ofmConn->slice;
+    auto &ifmSlice = ifmConn->slice;
+    auto *kernel = op->Kernel();
+    auto &padding = kernel->Padding();
+    ofmSlice.Initialize(ofmShape.WithZeros(), ofmShape);
+    ifmSlice.Initialize(ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()), ifmShape);
 
     auto ofmRank = ofmShape.Size();
     if ( ofmRank > 3 && (ofmShape.Elements() > ofmShape.Height() * ofmShape.Width() * ofmShape.Depth()) )
     {
         return DecomposeLeadingDimensions(ofmRank - 3, arch, std::move(op), DecomposeMaxPool);
     }
-
+    if ( !NeedsDecompose(arch, op.get()) )
+    {
+        UpdatePaddingAndIfmOffset(op.get());
+        result.emplace_back(std::move(op));
+        return result;
+    }
+    // Decomposition for large dimensions & strides is needed here.
+    // If we get here, decomposition has failed, the resulting operations will be executed on CPU
+    UpdatePaddingAndIfmOffset(op.get());
     result.emplace_back(std::move(op));
     return result;
 }
diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp
index baf3d81e..7dca942d 100644
--- a/ethosu/regor/compiler/scheduler_decompose.hpp
+++ b/ethosu/regor/compiler/scheduler_decompose.hpp
@@ -44,6 +44,7 @@ std::vector<std::unique_ptr<SchedulerOperation>> DecomposeMatmul(Architecture *a
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeReduce(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeReverse(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeTranspose(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
+std::vector<std::unique_ptr<SchedulerOperation>> DecomposeAvgPool(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeMaxPool(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> DecomposeResize(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
 std::vector<std::unique_ptr<SchedulerOperation>> LegaliseResize(Architecture *arch, std::unique_ptr<SchedulerOperation> op);
diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp
index 550f8df2..3de2f3df 100644
--- a/ethosu/regor/compiler/scheduler_packing.cpp
+++ b/ethosu/regor/compiler/scheduler_packing.cpp
@@ -691,6 +691,9 @@ std::vector<std::unique_ptr<SchedulerOperation>> SchedulerPacking::DecomposeSche
         case OpType::Transpose:
            result = DecomposeTranspose(_arch, std::move(op));
            break;
+        case OpType::AvgPool:
+           result = DecomposeAvgPool(_arch, std::move(op));
+           break;
         case OpType::MaxPool:
            result = DecomposeMaxPool(_arch, std::move(op));
            break;
--
GitLab
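
The decomposition above follows a pattern that is easier to see in isolation: split the leading (batch) dimension into single-batch sub-operations, and fold the kernel's top/left padding into a negative IFM read offset so the sub-operations carry offsets instead of padding attributes. The standalone C++ sketch below illustrates that pattern only; Slice, PoolOp, PaddingToOffset and DecomposeBatch are simplified, hypothetical stand-ins, not the regor SchedulerOperation/Kernel API used in the patch.

// Illustration only: simplified stand-in types, not regor code.
#include <cstdio>
#include <vector>

struct Slice
{
    int offsetN = 0, offsetH = 0, offsetW = 0;  // read/write start per axis
    int sizeN = 0, sizeH = 0, sizeW = 0;        // extent per axis
};

struct PoolOp
{
    Slice ifm;  // input feature map slice
    Slice ofm;  // output feature map slice
    int padTop = 0, padLeft = 0;
};

// Fold explicit top/left padding into a negative IFM offset, mirroring the idea
// behind the patch's WithHW(-padding.Top(), -padding.Left()) slice offset and
// UpdatePaddingAndIfmOffset: the op starts reading "above and to the left" of
// the tensor instead of carrying padding attributes.
static void PaddingToOffset(PoolOp &op)
{
    op.ifm.offsetH -= op.padTop;
    op.ifm.offsetW -= op.padLeft;
    op.padTop = 0;
    op.padLeft = 0;
}

// Split an op with batch N > 1 into N single-batch ops, analogous to handing
// the leading dimension to a per-batch decomposition.
static std::vector<PoolOp> DecomposeBatch(const PoolOp &op)
{
    std::vector<PoolOp> result;
    for ( int n = 0; n < op.ofm.sizeN; ++n )
    {
        PoolOp sub = op;
        sub.ifm.offsetN = op.ifm.offsetN + n;
        sub.ofm.offsetN = op.ofm.offsetN + n;
        sub.ifm.sizeN = 1;
        sub.ofm.sizeN = 1;
        PaddingToOffset(sub);
        result.push_back(sub);
    }
    return result;
}

int main()
{
    PoolOp op;
    op.ifm = {0, 0, 0, 4, 16, 16};  // N=4, 16x16 input
    op.ofm = {0, 0, 0, 4, 16, 16};  // same-size output (stride 1, SAME padding)
    op.padTop = 1;
    op.padLeft = 1;

    for ( const PoolOp &sub : DecomposeBatch(op) )
    {
        std::printf("batch %d: ifm HW offset (%d, %d), pad (%d, %d)\n", sub.ofm.offsetN,
            sub.ifm.offsetH, sub.ifm.offsetW, sub.padTop, sub.padLeft);
    }
    return 0;
}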